diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml index 92057006c9309..1a366975d824f 100644 --- a/.cirrus.tasks.yml +++ b/.cirrus.tasks.yml @@ -938,14 +938,11 @@ task: # - Don't use ccache, the files are uncacheable, polluting ccache's # cache # - Use -fmax-errors, as particularly cpluspluscheck can be very verbose - # - XXX have to disable ICU to avoid errors: - # https://postgr.es/m/20220323002024.f2g6tivduzrktgfa%40alap3.anarazel.de ### always: headers_headerscheck_script: | time ./configure \ ${LINUX_CONFIGURE_FEATURES} \ - --without-icu \ --quiet \ CC="gcc" CXX"=g++" CLANG="clang-16" make -s -j${BUILD_JOBS} clean diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 8048afd1a80fa..f83e2fc658664 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -14,6 +14,15 @@ # # $ git log --pretty=format:"%H # %cd%n# %s" $PGINDENTGITHASH -1 --date=iso +1d1612aec7688139e1a5506df1366b4b6a69605d # 2025-07-29 09:10:41 -0400 +# Run pgindent. + +73873805fb3627cb23937c750fa83ffd8f16fc6c # 2025-07-25 16:36:44 -0400 +# Run pgindent on the changes of the previous patch. + +9e345415bcd3c4358350b89edfd710469b8bfaf9 # 2025-07-01 15:23:07 +0200 +# Fix indentation in pg_numa code + b27644bade0348d0dafd3036c47880a349fe9332 # 2025-06-15 13:04:24 -0400 # Sync typedefs.list with the buildfarm. diff --git a/config/llvm.m4 b/config/llvm.m4 index fa4bedd9370fc..9d6fe8199e364 100644 --- a/config/llvm.m4 +++ b/config/llvm.m4 @@ -4,7 +4,7 @@ # ----------------- # # Look for the LLVM installation, check that it's new enough, set the -# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH} and LDFLAGS +# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH,LIBS} # variables. Also verify that CLANG is available, to transform C # into bitcode. # @@ -55,7 +55,7 @@ AC_DEFUN([PGAC_LLVM_SUPPORT], for pgac_option in `$LLVM_CONFIG --ldflags`; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";; esac done diff --git a/config/programs.m4 b/config/programs.m4 index 0ad1e58b48d6b..e57fe4907b844 100644 --- a/config/programs.m4 +++ b/config/programs.m4 @@ -284,20 +284,26 @@ AC_DEFUN([PGAC_CHECK_STRIP], AC_DEFUN([PGAC_CHECK_LIBCURL], [ + # libcurl compiler/linker flags are kept separate from the global flags, so + # they have to be added back temporarily for the following tests. + pgac_save_CPPFLAGS=$CPPFLAGS + pgac_save_LDFLAGS=$LDFLAGS + pgac_save_LIBS=$LIBS + + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS" + AC_CHECK_HEADER(curl/curl.h, [], [AC_MSG_ERROR([header file is required for --with-libcurl])]) + + # LIBCURL_LDLIBS is determined here. Like the compiler flags, it should not + # pollute the global LIBS setting. AC_CHECK_LIB(curl, curl_multi_init, [ AC_DEFINE([HAVE_LIBCURL], [1], [Define to 1 if you have the `curl' library (-lcurl).]) AC_SUBST(LIBCURL_LDLIBS, -lcurl) ], [AC_MSG_ERROR([library 'curl' does not provide curl_multi_init])]) - pgac_save_CPPFLAGS=$CPPFLAGS - pgac_save_LDFLAGS=$LDFLAGS - pgac_save_LIBS=$LIBS - - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS" LIBS="$LIBCURL_LDLIBS $LIBS" # Check to see whether the current platform supports threadsafe Curl diff --git a/configure b/configure index 16ef5b58d1a87..507a2437c3308 100755 --- a/configure +++ b/configure @@ -5194,7 +5194,7 @@ fi for pgac_option in `$LLVM_CONFIG --ldflags`; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";; esac done @@ -9436,12 +9436,12 @@ fi # Note the user could also set XML2_CFLAGS/XML2_LIBS directly for pgac_option in $XML2_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $XML2_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -9666,12 +9666,12 @@ fi # note that -llz4 will be added by AC_CHECK_LIB below. for pgac_option in $LZ4_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $LZ4_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -9807,12 +9807,12 @@ fi # note that -lzstd will be added by AC_CHECK_LIB below. for pgac_option in $ZSTD_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $ZSTD_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -12717,6 +12717,15 @@ fi if test "$with_libcurl" = yes ; then + # libcurl compiler/linker flags are kept separate from the global flags, so + # they have to be added back temporarily for the following tests. + pgac_save_CPPFLAGS=$CPPFLAGS + pgac_save_LDFLAGS=$LDFLAGS + pgac_save_LIBS=$LIBS + + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS" + ac_fn_c_check_header_mongrel "$LINENO" "curl/curl.h" "ac_cv_header_curl_curl_h" "$ac_includes_default" if test "x$ac_cv_header_curl_curl_h" = xyes; then : @@ -12725,6 +12734,9 @@ else fi + + # LIBCURL_LDLIBS is determined here. Like the compiler flags, it should not + # pollute the global LIBS setting. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for curl_multi_init in -lcurl" >&5 $as_echo_n "checking for curl_multi_init in -lcurl... " >&6; } if ${ac_cv_lib_curl_curl_multi_init+:} false; then : @@ -12774,12 +12786,6 @@ else fi - pgac_save_CPPFLAGS=$CPPFLAGS - pgac_save_LDFLAGS=$LDFLAGS - pgac_save_LIBS=$LIBS - - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS" LIBS="$LIBCURL_LDLIBS $LIBS" # Check to see whether the current platform supports threadsafe Curl @@ -13309,6 +13315,23 @@ fi fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + for ac_func in io_uring_queue_init_mem +do : + ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem" +if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_IO_URING_QUEUE_INIT_MEM 1 +_ACEOF + +fi +done + + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 $as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; } @@ -16635,7 +16658,7 @@ fi if test "$with_icu" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$ICU_CFLAGS $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $ICU_CFLAGS" # Verify we have ICU's header files ac_fn_c_check_header_mongrel "$LINENO" "unicode/ucol.h" "ac_cv_header_unicode_ucol_h" "$ac_includes_default" @@ -17542,7 +17565,7 @@ $as_echo "#define HAVE_GCC__ATOMIC_INT64_CAS 1" >>confdefs.h fi -# Check for x86 cpuid instruction +# Check for __get_cpuid() and __cpuid() { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid" >&5 $as_echo_n "checking for __get_cpuid... " >&6; } if ${pgac_cv__get_cpuid+:} false; then : @@ -17575,77 +17598,79 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 -$as_echo_n "checking for __get_cpuid_count... " >&6; } -if ${pgac_cv__get_cpuid_count+:} false; then : +else + # __cpuid() + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 +$as_echo_n "checking for __cpuid... " >&6; } +if ${pgac_cv__cpuid+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include +#include int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + __cpuid(exx, 1); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__get_cpuid_count="yes" + pgac_cv__cpuid="yes" else - pgac_cv__get_cpuid_count="no" + pgac_cv__cpuid="no" fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 -$as_echo "$pgac_cv__get_cpuid_count" >&6; } -if test x"$pgac_cv__get_cpuid_count" = x"yes"; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuid" >&5 +$as_echo "$pgac_cv__cpuid" >&6; } + if test x"$pgac_cv__cpuid" = x"yes"; then -$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h +$as_echo "#define HAVE__CPUID 1" >>confdefs.h + fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 -$as_echo_n "checking for __cpuid... " >&6; } -if ${pgac_cv__cpuid+:} false; then : +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include +#include int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid(exx[0], 1); + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__cpuid="yes" + pgac_cv__get_cpuid_count="yes" else - pgac_cv__cpuid="no" + pgac_cv__get_cpuid_count="no" fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuid" >&5 -$as_echo "$pgac_cv__cpuid" >&6; } -if test x"$pgac_cv__cpuid" = x"yes"; then - -$as_echo "#define HAVE__CPUID 1" >>confdefs.h +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then -fi +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +else + # __cpuidex() + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 $as_echo_n "checking for __cpuidex... " >&6; } if ${pgac_cv__cpuidex+:} false; then : $as_echo_n "(cached) " >&6 @@ -17657,7 +17682,7 @@ int main () { unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuidex(exx[0], 7, 0); + __cpuidex(exx, 7, 0); ; return 0; @@ -17673,10 +17698,11 @@ rm -f core conftest.err conftest.$ac_objext \ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 $as_echo "$pgac_cv__cpuidex" >&6; } -if test x"$pgac_cv__cpuidex" = x"yes"; then + if test x"$pgac_cv__cpuidex" = x"yes"; then $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + fi fi # Check for XSAVE intrinsics @@ -18853,7 +18879,7 @@ Use --without-tcl to disable building PL/Tcl." "$LINENO" 5 fi # now that we have TCL_INCLUDE_SPEC, we can check for ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC" ac_fn_c_check_header_mongrel "$LINENO" "tcl.h" "ac_cv_header_tcl_h" "$ac_includes_default" if test "x$ac_cv_header_tcl_h" = xyes; then : @@ -18922,7 +18948,7 @@ fi # check for if test "$with_python" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$python_includespec $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $python_includespec" ac_fn_c_check_header_mongrel "$LINENO" "Python.h" "ac_cv_header_Python_h" "$ac_includes_default" if test "x$ac_cv_header_Python_h" = xyes; then : diff --git a/configure.ac b/configure.ac index b3efc49c97a9d..5f4548adc5cd1 100644 --- a/configure.ac +++ b/configure.ac @@ -1103,12 +1103,12 @@ if test "$with_libxml" = yes ; then # Note the user could also set XML2_CFLAGS/XML2_LIBS directly for pgac_option in $XML2_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $XML2_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1152,12 +1152,12 @@ if test "$with_lz4" = yes; then # note that -llz4 will be added by AC_CHECK_LIB below. for pgac_option in $LZ4_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $LZ4_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1177,12 +1177,12 @@ if test "$with_zstd" = yes; then # note that -lzstd will be added by AC_CHECK_LIB below. for pgac_option in $ZSTD_CFLAGS; do case $pgac_option in - -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";; + -I*|-D*) INCLUDES="$INCLUDES $pgac_option";; esac done for pgac_option in $ZSTD_LIBS; do case $pgac_option in - -L*) LDFLAGS="$LDFLAGS $pgac_option";; + -L*) LIBDIRS="$LIBDIRS $pgac_option";; esac done fi @@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])]) fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + AC_CHECK_FUNCS([io_uring_queue_init_mem]) + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) fi @@ -1937,7 +1944,7 @@ fi if test "$with_icu" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$ICU_CFLAGS $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $ICU_CFLAGS" # Verify we have ICU's header files AC_CHECK_HEADER(unicode/ucol.h, [], @@ -2037,7 +2044,7 @@ PGAC_HAVE_GCC__ATOMIC_INT32_CAS PGAC_HAVE_GCC__ATOMIC_INT64_CAS -# Check for x86 cpuid instruction +# Check for __get_cpuid() and __cpuid() AC_CACHE_CHECK([for __get_cpuid], [pgac_cv__get_cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2047,8 +2054,21 @@ AC_CACHE_CHECK([for __get_cpuid], [pgac_cv__get_cpuid], [pgac_cv__get_cpuid="no"])]) if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) +else + # __cpuid() + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __cpuid(exx, 1); + ]])], + [pgac_cv__cpuid="yes"], + [pgac_cv__cpuid="no"])]) + if test x"$pgac_cv__cpuid" = x"yes"; then + AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) + fi fi +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2058,28 +2078,18 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [pgac_cv__get_cpuid_count="no"])]) if test x"$pgac_cv__get_cpuid_count" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) -fi - -AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], -[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuid(exx[0], 1); - ]])], - [pgac_cv__cpuid="yes"], - [pgac_cv__cpuid="no"])]) -if test x"$pgac_cv__cpuid" = x"yes"; then - AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) -fi - -AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], -[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __get_cpuidex(exx[0], 7, 0); - ]])], - [pgac_cv__cpuidex="yes"], - [pgac_cv__cpuidex="no"])]) -if test x"$pgac_cv__cpuidex" = x"yes"; then - AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +else + # __cpuidex() + AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) + if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) + fi fi # Check for XSAVE intrinsics @@ -2337,7 +2347,7 @@ Use --without-tcl to disable building PL/Tcl.]) fi # now that we have TCL_INCLUDE_SPEC, we can check for ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC" AC_CHECK_HEADER(tcl.h, [], [AC_MSG_ERROR([header file is required for Tcl])]) CPPFLAGS=$ac_save_CPPFLAGS fi @@ -2374,7 +2384,7 @@ fi # check for if test "$with_python" = yes; then ac_save_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$python_includespec $CPPFLAGS" + CPPFLAGS="$CPPFLAGS $python_includespec" AC_CHECK_HEADER(Python.h, [], [AC_MSG_ERROR([header file is required for Python])]) CPPFLAGS=$ac_save_CPPFLAGS fi diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index c6f4b16c55615..6558f2c5a4ff4 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -60,6 +60,14 @@ SELECT bt_index_parent_check('bttest_a_brin_idx'); ERROR: expected "btree" index as targets for verification DETAIL: Relation "bttest_a_brin_idx" is a brin index. ROLLBACK; +-- verify partitioned indexes are rejected (error) +BEGIN; +CREATE TABLE bttest_partitioned (a int, b int) PARTITION BY list (a); +CREATE INDEX bttest_btree_partitioned_idx ON bttest_partitioned USING btree (b); +SELECT bt_index_parent_check('bttest_btree_partitioned_idx'); +ERROR: expected index as targets for verification +DETAIL: This operation is not supported for partitioned indexes. +ROLLBACK; -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); bt_index_check diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index 0793dbfeebd82..171f7f691ec60 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -52,6 +52,13 @@ CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); SELECT bt_index_parent_check('bttest_a_brin_idx'); ROLLBACK; +-- verify partitioned indexes are rejected (error) +BEGIN; +CREATE TABLE bttest_partitioned (a int, b int) PARTITION BY list (a); +CREATE INDEX bttest_btree_partitioned_idx ON bttest_partitioned USING btree (b); +SELECT bt_index_parent_check('bttest_btree_partitioned_idx'); +ROLLBACK; + -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); -- more expansive tests diff --git a/contrib/amcheck/verify_common.c b/contrib/amcheck/verify_common.c index d095e62ce551f..a31ce06ed99a3 100644 --- a/contrib/amcheck/verify_common.c +++ b/contrib/amcheck/verify_common.c @@ -18,11 +18,13 @@ #include "verify_common.h" #include "catalog/index.h" #include "catalog/pg_am.h" +#include "commands/defrem.h" #include "commands/tablecmds.h" #include "utils/guc.h" #include "utils/syscache.h" static bool amcheck_index_mainfork_expected(Relation rel); +static bool index_checkable(Relation rel, Oid am_id); /* @@ -155,23 +157,21 @@ amcheck_lock_relation_and_check(Oid indrelid, * callable by non-superusers. If granted, it's useful to be able to check a * whole cluster. */ -bool +static bool index_checkable(Relation rel, Oid am_id) { - if (rel->rd_rel->relkind != RELKIND_INDEX || - rel->rd_rel->relam != am_id) - { - HeapTuple amtup; - HeapTuple amtuprel; + if (rel->rd_rel->relkind != RELKIND_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("expected index as targets for verification"), + errdetail_relkind_not_supported(rel->rd_rel->relkind))); - amtup = SearchSysCache1(AMOID, ObjectIdGetDatum(am_id)); - amtuprel = SearchSysCache1(AMOID, ObjectIdGetDatum(rel->rd_rel->relam)); + if (rel->rd_rel->relam != am_id) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("expected \"%s\" index as targets for verification", NameStr(((Form_pg_am) GETSTRUCT(amtup))->amname)), + errmsg("expected \"%s\" index as targets for verification", get_am_name(am_id)), errdetail("Relation \"%s\" is a %s index.", - RelationGetRelationName(rel), NameStr(((Form_pg_am) GETSTRUCT(amtuprel))->amname)))); - } + RelationGetRelationName(rel), get_am_name(rel->rd_rel->relam)))); if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, @@ -182,7 +182,7 @@ index_checkable(Relation rel, Oid am_id) if (!rel->rd_index->indisvalid) ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot check index \"%s\"", RelationGetRelationName(rel)), errdetail("Index is not valid."))); diff --git a/contrib/amcheck/verify_common.h b/contrib/amcheck/verify_common.h index e78adb68808f0..3f4c57f963d6b 100644 --- a/contrib/amcheck/verify_common.h +++ b/contrib/amcheck/verify_common.h @@ -16,8 +16,7 @@ #include "utils/relcache.h" #include "miscadmin.h" -/* Typedefs for callback functions for amcheck_lock_relation_and_check */ -typedef void (*IndexCheckableCallback) (Relation index); +/* Typedef for callback function for amcheck_lock_relation_and_check */ typedef void (*IndexDoCheckCallback) (Relation rel, Relation heaprel, void *state, @@ -27,5 +26,3 @@ extern void amcheck_lock_relation_and_check(Oid indrelid, Oid am_id, IndexDoCheckCallback check, LOCKMODE lockmode, void *state); - -extern bool index_checkable(Relation rel, Oid am_id); diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index aa9cccd1da4fe..4963e9245cb54 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -1942,7 +1942,7 @@ check_tuple(HeapCheckContext *ctx, bool *xmin_commit_status_ok, if (RelationGetDescr(ctx->rel)->natts < ctx->natts) { report_corruption(ctx, - psprintf("number of attributes %u exceeds maximum expected for table %u", + psprintf("number of attributes %u exceeds maximum %u expected for table", ctx->natts, RelationGetDescr(ctx->rel)->natts)); return; diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index f11c43a0ed797..0949c88983ac2 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -913,7 +913,7 @@ bt_report_duplicate(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index uniqueness is violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%X.", + errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%08X.", itid, pposting, nitid, pnposting, htid, nhtid, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1058,7 +1058,7 @@ bt_leftmost_ignoring_half_dead(BtreeCheckState *state, (errcode(ERRCODE_NO_DATA), errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u right block=%u page lsn=%X/%08X.", reached, reached_from, LSN_FORMAT_ARGS(pagelsn)))); @@ -1283,7 +1283,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("wrong number of high key index tuple attributes in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.", + errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%08X.", state->targetblock, BTreeTupleGetNAtts(itup, state->rel), P_ISLEAF(topaque) ? "heap" : "index", @@ -1332,7 +1332,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index tuple size does not equal lp_len in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.", + errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%08X.", state->targetblock, offset, tupsize, ItemIdGetLength(itemid), LSN_FORMAT_ARGS(state->targetlsn)), @@ -1356,7 +1356,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("wrong number of index tuple attributes in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%08X.", itid, BTreeTupleGetNAtts(itup, state->rel), P_ISLEAF(topaque) ? "heap" : "index", @@ -1406,7 +1406,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%08X.", itid, htid, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1435,7 +1435,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("posting list contains misplaced TID in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%08X.", itid, i, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1488,7 +1488,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index row size %zu exceeds maximum for index \"%s\"", tupsize, RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? "heap" : "index", htid, @@ -1595,7 +1595,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("high key invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? "heap" : "index", htid, @@ -1641,9 +1641,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("item order invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Lower index tid=%s (points to %s tid=%s) " - "higher index tid=%s (points to %s tid=%s) " - "page lsn=%X/%X.", + errdetail_internal("Lower index tid=%s (points to %s tid=%s) higher index tid=%s (points to %s tid=%s) page lsn=%X/%08X.", itid, P_ISLEAF(topaque) ? "heap" : "index", htid, @@ -1760,7 +1758,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("cross page item order invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.", + errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%08X.", state->targetblock, offset, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -1813,7 +1811,7 @@ bt_target_page_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("right block of leaf block is non-leaf for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u page lsn=%X/%08X.", state->targetblock, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2237,7 +2235,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2323,7 +2321,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); pivotkey_offset = P_HIKEY; @@ -2353,7 +2351,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("can't find left sibling high key in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); itup = state->lowkey; @@ -2365,7 +2363,7 @@ bt_child_highkey_check(BtreeCheckState *state, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("mismatch between parent key and child high key in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.", state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -2505,7 +2503,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("downlink to deleted page found in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.", + errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%08X.", state->targetblock, childblock, LSN_FORMAT_ARGS(state->targetlsn)))); @@ -2546,7 +2544,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("down-link lower bound invariant violated for index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.", + errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%08X.", state->targetblock, childblock, offset, LSN_FORMAT_ARGS(state->targetlsn)))); } @@ -2616,7 +2614,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_NO_DATA), errmsg_internal("harmless interrupted page split detected in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", + errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%08X.", blkno, opaque->btpo_level, opaque->btpo_prev, LSN_FORMAT_ARGS(pagelsn)))); @@ -2638,7 +2636,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leaf index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u page lsn=%X/%X.", + errdetail_internal("Block=%u page lsn=%X/%08X.", blkno, LSN_FORMAT_ARGS(pagelsn)))); @@ -2704,7 +2702,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink to deleted leaf page found in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", + errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%08X.", blkno, childblk, LSN_FORMAT_ARGS(pagelsn)))); @@ -2730,7 +2728,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u level=%u page lsn=%X/%X.", + errdetail_internal("Block=%u level=%u page lsn=%X/%08X.", blkno, opaque->btpo_level, LSN_FORMAT_ARGS(pagelsn)))); } diff --git a/contrib/basebackup_to_shell/meson.build b/contrib/basebackup_to_shell/meson.build index 8c88242456e80..8a4f170c5f829 100644 --- a/contrib/basebackup_to_shell/meson.build +++ b/contrib/basebackup_to_shell/meson.build @@ -24,7 +24,7 @@ tests += { 'tests': [ 't/001_basic.pl', ], - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '' }, + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? tar.full_path() : '' }, }, } diff --git a/contrib/basic_archive/basic_archive.c b/contrib/basic_archive/basic_archive.c index 4a8b8c7ac29c1..8fc633d2cbf62 100644 --- a/contrib/basic_archive/basic_archive.c +++ b/contrib/basic_archive/basic_archive.c @@ -65,7 +65,7 @@ void _PG_init(void) { DefineCustomStringVariable("basic_archive.archive_directory", - gettext_noop("Archive file destination directory."), + "Archive file destination directory.", NULL, &archive_directory, "", diff --git a/contrib/btree_gin/Makefile b/contrib/btree_gin/Makefile index 0a15811516819..ad054598db6c9 100644 --- a/contrib/btree_gin/Makefile +++ b/contrib/btree_gin/Makefile @@ -7,7 +7,7 @@ OBJS = \ EXTENSION = btree_gin DATA = btree_gin--1.0.sql btree_gin--1.0--1.1.sql btree_gin--1.1--1.2.sql \ - btree_gin--1.2--1.3.sql + btree_gin--1.2--1.3.sql btree_gin--1.3--1.4.sql PGFILEDESC = "btree_gin - B-tree equivalent GIN operator classes" REGRESS = install_btree_gin int2 int4 int8 float4 float8 money oid \ diff --git a/contrib/btree_gin/btree_gin--1.3--1.4.sql b/contrib/btree_gin/btree_gin--1.3--1.4.sql new file mode 100644 index 0000000000000..61b5dcbede6c5 --- /dev/null +++ b/contrib/btree_gin/btree_gin--1.3--1.4.sql @@ -0,0 +1,151 @@ +/* contrib/btree_gin/btree_gin--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION btree_gin UPDATE TO '1.4'" to load this file. \quit + +-- +-- Cross-type operator support is new in 1.4. We only need to worry +-- about this for cross-type operators that exist in core. +-- +-- Because the opclass extractQuery and consistent methods don't directly +-- get any information about the datatype of the RHS value, we have to +-- encode that in the operator strategy numbers. The strategy numbers +-- are the operator's normal btree strategy (1-5) plus 16 times a code +-- for the RHS datatype. +-- + +ALTER OPERATOR FAMILY int2_ops USING gin +ADD + -- Code 1: RHS is int4 + OPERATOR 0x11 < (int2, int4), + OPERATOR 0x12 <= (int2, int4), + OPERATOR 0x13 = (int2, int4), + OPERATOR 0x14 >= (int2, int4), + OPERATOR 0x15 > (int2, int4), + -- Code 2: RHS is int8 + OPERATOR 0x21 < (int2, int8), + OPERATOR 0x22 <= (int2, int8), + OPERATOR 0x23 = (int2, int8), + OPERATOR 0x24 >= (int2, int8), + OPERATOR 0x25 > (int2, int8) +; + +ALTER OPERATOR FAMILY int4_ops USING gin +ADD + -- Code 1: RHS is int2 + OPERATOR 0x11 < (int4, int2), + OPERATOR 0x12 <= (int4, int2), + OPERATOR 0x13 = (int4, int2), + OPERATOR 0x14 >= (int4, int2), + OPERATOR 0x15 > (int4, int2), + -- Code 2: RHS is int8 + OPERATOR 0x21 < (int4, int8), + OPERATOR 0x22 <= (int4, int8), + OPERATOR 0x23 = (int4, int8), + OPERATOR 0x24 >= (int4, int8), + OPERATOR 0x25 > (int4, int8) +; + +ALTER OPERATOR FAMILY int8_ops USING gin +ADD + -- Code 1: RHS is int2 + OPERATOR 0x11 < (int8, int2), + OPERATOR 0x12 <= (int8, int2), + OPERATOR 0x13 = (int8, int2), + OPERATOR 0x14 >= (int8, int2), + OPERATOR 0x15 > (int8, int2), + -- Code 2: RHS is int4 + OPERATOR 0x21 < (int8, int4), + OPERATOR 0x22 <= (int8, int4), + OPERATOR 0x23 = (int8, int4), + OPERATOR 0x24 >= (int8, int4), + OPERATOR 0x25 > (int8, int4) +; + +ALTER OPERATOR FAMILY float4_ops USING gin +ADD + -- Code 1: RHS is float8 + OPERATOR 0x11 < (float4, float8), + OPERATOR 0x12 <= (float4, float8), + OPERATOR 0x13 = (float4, float8), + OPERATOR 0x14 >= (float4, float8), + OPERATOR 0x15 > (float4, float8) +; + +ALTER OPERATOR FAMILY float8_ops USING gin +ADD + -- Code 1: RHS is float4 + OPERATOR 0x11 < (float8, float4), + OPERATOR 0x12 <= (float8, float4), + OPERATOR 0x13 = (float8, float4), + OPERATOR 0x14 >= (float8, float4), + OPERATOR 0x15 > (float8, float4) +; + +ALTER OPERATOR FAMILY text_ops USING gin +ADD + -- Code 1: RHS is name + OPERATOR 0x11 < (text, name), + OPERATOR 0x12 <= (text, name), + OPERATOR 0x13 = (text, name), + OPERATOR 0x14 >= (text, name), + OPERATOR 0x15 > (text, name) +; + +ALTER OPERATOR FAMILY name_ops USING gin +ADD + -- Code 1: RHS is text + OPERATOR 0x11 < (name, text), + OPERATOR 0x12 <= (name, text), + OPERATOR 0x13 = (name, text), + OPERATOR 0x14 >= (name, text), + OPERATOR 0x15 > (name, text) +; + +ALTER OPERATOR FAMILY date_ops USING gin +ADD + -- Code 1: RHS is timestamp + OPERATOR 0x11 < (date, timestamp), + OPERATOR 0x12 <= (date, timestamp), + OPERATOR 0x13 = (date, timestamp), + OPERATOR 0x14 >= (date, timestamp), + OPERATOR 0x15 > (date, timestamp), + -- Code 2: RHS is timestamptz + OPERATOR 0x21 < (date, timestamptz), + OPERATOR 0x22 <= (date, timestamptz), + OPERATOR 0x23 = (date, timestamptz), + OPERATOR 0x24 >= (date, timestamptz), + OPERATOR 0x25 > (date, timestamptz) +; + +ALTER OPERATOR FAMILY timestamp_ops USING gin +ADD + -- Code 1: RHS is date + OPERATOR 0x11 < (timestamp, date), + OPERATOR 0x12 <= (timestamp, date), + OPERATOR 0x13 = (timestamp, date), + OPERATOR 0x14 >= (timestamp, date), + OPERATOR 0x15 > (timestamp, date), + -- Code 2: RHS is timestamptz + OPERATOR 0x21 < (timestamp, timestamptz), + OPERATOR 0x22 <= (timestamp, timestamptz), + OPERATOR 0x23 = (timestamp, timestamptz), + OPERATOR 0x24 >= (timestamp, timestamptz), + OPERATOR 0x25 > (timestamp, timestamptz) +; + +ALTER OPERATOR FAMILY timestamptz_ops USING gin +ADD + -- Code 1: RHS is date + OPERATOR 0x11 < (timestamptz, date), + OPERATOR 0x12 <= (timestamptz, date), + OPERATOR 0x13 = (timestamptz, date), + OPERATOR 0x14 >= (timestamptz, date), + OPERATOR 0x15 > (timestamptz, date), + -- Code 2: RHS is timestamp + OPERATOR 0x21 < (timestamptz, timestamp), + OPERATOR 0x22 <= (timestamptz, timestamp), + OPERATOR 0x23 = (timestamptz, timestamp), + OPERATOR 0x24 >= (timestamptz, timestamp), + OPERATOR 0x25 > (timestamptz, timestamp) +; diff --git a/contrib/btree_gin/btree_gin.c b/contrib/btree_gin/btree_gin.c index 98663cb86117e..8c477d17e22ce 100644 --- a/contrib/btree_gin/btree_gin.c +++ b/contrib/btree_gin/btree_gin.c @@ -6,6 +6,7 @@ #include #include "access/stratnum.h" +#include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/date.h" #include "utils/float.h" @@ -13,20 +14,36 @@ #include "utils/numeric.h" #include "utils/timestamp.h" #include "utils/uuid.h" +#include "varatt.h" PG_MODULE_MAGIC_EXT( .name = "btree_gin", .version = PG_VERSION ); +/* + * Our opclasses use the same strategy numbers as btree (1-5) for same-type + * comparison operators. For cross-type comparison operators, the + * low 4 bits of our strategy numbers are the btree strategy number, + * and the upper bits are a code for the right-hand-side data type. + */ +#define BTGIN_GET_BTREE_STRATEGY(strat) ((strat) & 0x0F) +#define BTGIN_GET_RHS_TYPE_CODE(strat) ((strat) >> 4) + +/* extra data passed from gin_btree_extract_query to gin_btree_compare_prefix */ typedef struct QueryInfo { - StrategyNumber strategy; - Datum datum; - bool is_varlena; - Datum (*typecmp) (FunctionCallInfo); + StrategyNumber strategy; /* operator strategy number */ + Datum orig_datum; /* original query (comparison) datum */ + Datum entry_datum; /* datum we reported as the entry value */ + PGFunction typecmp; /* appropriate btree comparison function */ } QueryInfo; +typedef Datum (*btree_gin_convert_function) (Datum input); + +typedef Datum (*btree_gin_leftmost_function) (void); + + /*** GIN support functions shared by all datatypes ***/ static Datum @@ -36,6 +53,7 @@ gin_btree_extract_value(FunctionCallInfo fcinfo, bool is_varlena) int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = (Datum *) palloc(sizeof(Datum)); + /* Ensure that values stored in the index are not toasted */ if (is_varlena) datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); entries[0] = datum; @@ -44,19 +62,12 @@ gin_btree_extract_value(FunctionCallInfo fcinfo, bool is_varlena) PG_RETURN_POINTER(entries); } -/* - * For BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, and - * BTEqualStrategyNumber we want to start the index scan at the - * supplied query datum, and work forward. For BTLessStrategyNumber - * and BTLessEqualStrategyNumber, we need to start at the leftmost - * key, and work forward until the supplied query datum (which must be - * sent along inside the QueryInfo structure). - */ static Datum gin_btree_extract_query(FunctionCallInfo fcinfo, - bool is_varlena, - Datum (*leftmostvalue) (void), - Datum (*typecmp) (FunctionCallInfo)) + btree_gin_leftmost_function leftmostvalue, + const bool *rhs_is_varlena, + const btree_gin_convert_function *cvt_fns, + const PGFunction *cmp_fns) { Datum datum = PG_GETARG_DATUM(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); @@ -65,21 +76,40 @@ gin_btree_extract_query(FunctionCallInfo fcinfo, Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4); Datum *entries = (Datum *) palloc(sizeof(Datum)); QueryInfo *data = (QueryInfo *) palloc(sizeof(QueryInfo)); - bool *ptr_partialmatch; + bool *ptr_partialmatch = (bool *) palloc(sizeof(bool)); + int btree_strat, + rhs_code; + + /* + * Extract the btree strategy code and the RHS data type code from the + * given strategy number. + */ + btree_strat = BTGIN_GET_BTREE_STRATEGY(strategy); + rhs_code = BTGIN_GET_RHS_TYPE_CODE(strategy); + /* + * Detoast the comparison datum. This isn't necessary for correctness, + * but it can save repeat detoastings within the comparison function. + */ + if (rhs_is_varlena[rhs_code]) + datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); + + /* Prep single comparison key with possible partial-match flag */ *nentries = 1; - ptr_partialmatch = *partialmatch = (bool *) palloc(sizeof(bool)); + *partialmatch = ptr_partialmatch; *ptr_partialmatch = false; - if (is_varlena) - datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); - data->strategy = strategy; - data->datum = datum; - data->is_varlena = is_varlena; - data->typecmp = typecmp; - *extra_data = (Pointer *) palloc(sizeof(Pointer)); - **extra_data = (Pointer) data; - switch (strategy) + /* + * For BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, and + * BTEqualStrategyNumber we want to start the index scan at the supplied + * query datum, and work forward. For BTLessStrategyNumber and + * BTLessEqualStrategyNumber, we need to start at the leftmost key, and + * work forward until the supplied query datum (which we'll send along + * inside the QueryInfo structure). Use partial match rules except for + * BTEqualStrategyNumber without a conversion function. (If there is a + * conversion function, comparison to the entry value is not trustworthy.) + */ + switch (btree_strat) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: @@ -91,75 +121,106 @@ gin_btree_extract_query(FunctionCallInfo fcinfo, *ptr_partialmatch = true; /* FALLTHROUGH */ case BTEqualStrategyNumber: - entries[0] = datum; + /* If we have a conversion function, apply it */ + if (cvt_fns && cvt_fns[rhs_code]) + { + entries[0] = (*cvt_fns[rhs_code]) (datum); + *ptr_partialmatch = true; + } + else + entries[0] = datum; break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); } + /* Fill "extra" data */ + data->strategy = strategy; + data->orig_datum = datum; + data->entry_datum = entries[0]; + data->typecmp = cmp_fns[rhs_code]; + *extra_data = (Pointer *) palloc(sizeof(Pointer)); + **extra_data = (Pointer) data; + PG_RETURN_POINTER(entries); } -/* - * Datum a is a value from extract_query method and for BTLess* - * strategy it is a left-most value. So, use original datum from QueryInfo - * to decide to stop scanning or not. Datum b is always from index. - */ static Datum gin_btree_compare_prefix(FunctionCallInfo fcinfo) { - Datum a = PG_GETARG_DATUM(0); - Datum b = PG_GETARG_DATUM(1); + Datum partial_key PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(0); + Datum key = PG_GETARG_DATUM(1); QueryInfo *data = (QueryInfo *) PG_GETARG_POINTER(3); int32 res, cmp; + /* + * partial_key is only an approximation to the real comparison value, + * especially if it's a leftmost value. We can get an accurate answer by + * doing a possibly-cross-type comparison to the real comparison value. + * (Note that partial_key and key are of the indexed datatype while + * orig_datum is of the query operator's RHS datatype.) + * + * But just to be sure that things are what we expect, let's assert that + * partial_key is indeed what gin_btree_extract_query reported, so that + * we'll notice if anyone ever changes the core code in a way that breaks + * our assumptions. + */ + Assert(partial_key == data->entry_datum); + cmp = DatumGetInt32(CallerFInfoFunctionCall2(data->typecmp, fcinfo->flinfo, PG_GET_COLLATION(), - (data->strategy == BTLessStrategyNumber || - data->strategy == BTLessEqualStrategyNumber) - ? data->datum : a, - b)); + data->orig_datum, + key)); - switch (data->strategy) + /* + * Convert the comparison result to the correct thing for the search + * operator strategy. When dealing with cross-type comparisons, an + * imprecise entry datum could lead GIN to start the scan just before the + * first possible match, so we must continue the scan if the current index + * entry doesn't satisfy the search condition for >= and > cases. But if + * that happens in an = search we can stop, because an imprecise entry + * datum means that the search value is unrepresentable in the indexed + * data type, so that there will be no exact matches. + */ + switch (BTGIN_GET_BTREE_STRATEGY(data->strategy)) { case BTLessStrategyNumber: /* If original datum > indexed one then return match */ if (cmp > 0) res = 0; else - res = 1; + res = 1; /* end scan */ break; case BTLessEqualStrategyNumber: - /* The same except equality */ + /* If original datum >= indexed one then return match */ if (cmp >= 0) res = 0; else - res = 1; + res = 1; /* end scan */ break; case BTEqualStrategyNumber: - if (cmp != 0) - res = 1; - else + /* If original datum = indexed one then return match */ + /* See above about why we can end scan when cmp < 0 */ + if (cmp == 0) res = 0; + else + res = 1; /* end scan */ break; case BTGreaterEqualStrategyNumber: /* If original datum <= indexed one then return match */ if (cmp <= 0) res = 0; else - res = 1; + res = -1; /* keep scanning */ break; case BTGreaterStrategyNumber: - /* If original datum <= indexed one then return match */ - /* If original datum == indexed one then continue scan */ + /* If original datum < indexed one then return match */ if (cmp < 0) res = 0; - else if (cmp == 0) - res = -1; else - res = 1; + res = -1; /* keep scanning */ break; default: elog(ERROR, "unrecognized strategy number: %d", @@ -182,19 +243,20 @@ gin_btree_consistent(PG_FUNCTION_ARGS) /*** GIN_SUPPORT macro defines the datatype specific functions ***/ -#define GIN_SUPPORT(type, is_varlena, leftmostvalue, typecmp) \ +#define GIN_SUPPORT(type, leftmostvalue, is_varlena, cvtfns, cmpfns) \ PG_FUNCTION_INFO_V1(gin_extract_value_##type); \ Datum \ gin_extract_value_##type(PG_FUNCTION_ARGS) \ { \ - return gin_btree_extract_value(fcinfo, is_varlena); \ + return gin_btree_extract_value(fcinfo, is_varlena[0]); \ } \ PG_FUNCTION_INFO_V1(gin_extract_query_##type); \ Datum \ gin_extract_query_##type(PG_FUNCTION_ARGS) \ { \ return gin_btree_extract_query(fcinfo, \ - is_varlena, leftmostvalue, typecmp); \ + leftmostvalue, is_varlena, \ + cvtfns, cmpfns); \ } \ PG_FUNCTION_INFO_V1(gin_compare_prefix_##type); \ Datum \ @@ -206,13 +268,66 @@ gin_compare_prefix_##type(PG_FUNCTION_ARGS) \ /*** Datatype specifications ***/ +/* Function to produce the least possible value of the indexed datatype */ static Datum leftmostvalue_int2(void) { return Int16GetDatum(SHRT_MIN); } -GIN_SUPPORT(int2, false, leftmostvalue_int2, btint2cmp) +/* + * For cross-type support, we must provide conversion functions that produce + * a Datum of the indexed datatype, since GIN requires the "entry" datums to + * be of that type. If an exact conversion is not possible, produce a value + * that will lead GIN to find the first index entry that is greater than + * or equal to the actual comparison value. (But rounding down is OK, so + * sometimes we might find an index entry that's just less than the + * comparison value.) + * + * For integer values, it's sufficient to clamp the input to be in-range. + * + * Note: for out-of-range input values, we could in theory detect that the + * search condition matches all or none of the index, and avoid a useless + * index descent in the latter case. Such searches are probably rare though, + * so we don't contort this code enough to do that. + */ +static Datum +cvt_int4_int2(Datum input) +{ + int32 val = DatumGetInt32(input); + + val = Max(val, SHRT_MIN); + val = Min(val, SHRT_MAX); + return Int16GetDatum((int16) val); +} + +static Datum +cvt_int8_int2(Datum input) +{ + int64 val = DatumGetInt64(input); + + val = Max(val, SHRT_MIN); + val = Min(val, SHRT_MAX); + return Int16GetDatum((int16) val); +} + +/* + * RHS-type-is-varlena flags, conversion and comparison function arrays, + * indexed by high bits of the operator strategy number. A NULL in the + * conversion function array indicates that no conversion is needed, which + * will always be the case for the zero'th entry. Note that the cross-type + * comparison functions should be the ones with the indexed datatype second. + */ +static const bool int2_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int2_cvt_fns[] = +{NULL, cvt_int4_int2, cvt_int8_int2}; + +static const PGFunction int2_cmp_fns[] = +{btint2cmp, btint42cmp, btint82cmp}; + +GIN_SUPPORT(int2, leftmostvalue_int2, int2_rhs_is_varlena, int2_cvt_fns, int2_cmp_fns) static Datum leftmostvalue_int4(void) @@ -220,7 +335,34 @@ leftmostvalue_int4(void) return Int32GetDatum(INT_MIN); } -GIN_SUPPORT(int4, false, leftmostvalue_int4, btint4cmp) +static Datum +cvt_int2_int4(Datum input) +{ + int16 val = DatumGetInt16(input); + + return Int32GetDatum((int32) val); +} + +static Datum +cvt_int8_int4(Datum input) +{ + int64 val = DatumGetInt64(input); + + val = Max(val, INT_MIN); + val = Min(val, INT_MAX); + return Int32GetDatum((int32) val); +} + +static const bool int4_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int4_cvt_fns[] = +{NULL, cvt_int2_int4, cvt_int8_int4}; + +static const PGFunction int4_cmp_fns[] = +{btint4cmp, btint24cmp, btint84cmp}; + +GIN_SUPPORT(int4, leftmostvalue_int4, int4_rhs_is_varlena, int4_cvt_fns, int4_cmp_fns) static Datum leftmostvalue_int8(void) @@ -228,7 +370,32 @@ leftmostvalue_int8(void) return Int64GetDatum(PG_INT64_MIN); } -GIN_SUPPORT(int8, false, leftmostvalue_int8, btint8cmp) +static Datum +cvt_int2_int8(Datum input) +{ + int16 val = DatumGetInt16(input); + + return Int64GetDatum((int64) val); +} + +static Datum +cvt_int4_int8(Datum input) +{ + int32 val = DatumGetInt32(input); + + return Int64GetDatum((int64) val); +} + +static const bool int8_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function int8_cvt_fns[] = +{NULL, cvt_int2_int8, cvt_int4_int8}; + +static const PGFunction int8_cmp_fns[] = +{btint8cmp, btint28cmp, btint48cmp}; + +GIN_SUPPORT(int8, leftmostvalue_int8, int8_rhs_is_varlena, int8_cvt_fns, int8_cmp_fns) static Datum leftmostvalue_float4(void) @@ -236,7 +403,34 @@ leftmostvalue_float4(void) return Float4GetDatum(-get_float4_infinity()); } -GIN_SUPPORT(float4, false, leftmostvalue_float4, btfloat4cmp) +static Datum +cvt_float8_float4(Datum input) +{ + float8 val = DatumGetFloat8(input); + float4 result; + + /* + * Assume that ordinary C conversion will produce a usable result. + * (Compare dtof(), which raises error conditions that we don't need.) + * Note that for inputs that aren't exactly representable as float4, it + * doesn't matter whether the conversion rounds up or down. That might + * cause us to scan a few index entries that we'll reject as not matching, + * but we won't miss any that should match. + */ + result = (float4) val; + return Float4GetDatum(result); +} + +static const bool float4_rhs_is_varlena[] = +{false, false}; + +static const btree_gin_convert_function float4_cvt_fns[] = +{NULL, cvt_float8_float4}; + +static const PGFunction float4_cmp_fns[] = +{btfloat4cmp, btfloat84cmp}; + +GIN_SUPPORT(float4, leftmostvalue_float4, float4_rhs_is_varlena, float4_cvt_fns, float4_cmp_fns) static Datum leftmostvalue_float8(void) @@ -244,7 +438,24 @@ leftmostvalue_float8(void) return Float8GetDatum(-get_float8_infinity()); } -GIN_SUPPORT(float8, false, leftmostvalue_float8, btfloat8cmp) +static Datum +cvt_float4_float8(Datum input) +{ + float4 val = DatumGetFloat4(input); + + return Float8GetDatum((float8) val); +} + +static const bool float8_rhs_is_varlena[] = +{false, false}; + +static const btree_gin_convert_function float8_cvt_fns[] = +{NULL, cvt_float4_float8}; + +static const PGFunction float8_cmp_fns[] = +{btfloat8cmp, btfloat48cmp}; + +GIN_SUPPORT(float8, leftmostvalue_float8, float8_rhs_is_varlena, float8_cvt_fns, float8_cmp_fns) static Datum leftmostvalue_money(void) @@ -252,7 +463,13 @@ leftmostvalue_money(void) return Int64GetDatum(PG_INT64_MIN); } -GIN_SUPPORT(money, false, leftmostvalue_money, cash_cmp) +static const bool money_rhs_is_varlena[] = +{false}; + +static const PGFunction money_cmp_fns[] = +{cash_cmp}; + +GIN_SUPPORT(money, leftmostvalue_money, money_rhs_is_varlena, NULL, money_cmp_fns) static Datum leftmostvalue_oid(void) @@ -260,7 +477,13 @@ leftmostvalue_oid(void) return ObjectIdGetDatum(0); } -GIN_SUPPORT(oid, false, leftmostvalue_oid, btoidcmp) +static const bool oid_rhs_is_varlena[] = +{false}; + +static const PGFunction oid_cmp_fns[] = +{btoidcmp}; + +GIN_SUPPORT(oid, leftmostvalue_oid, oid_rhs_is_varlena, NULL, oid_cmp_fns) static Datum leftmostvalue_timestamp(void) @@ -268,9 +491,75 @@ leftmostvalue_timestamp(void) return TimestampGetDatum(DT_NOBEGIN); } -GIN_SUPPORT(timestamp, false, leftmostvalue_timestamp, timestamp_cmp) +static Datum +cvt_date_timestamp(Datum input) +{ + DateADT val = DatumGetDateADT(input); + Timestamp result; + int overflow; -GIN_SUPPORT(timestamptz, false, leftmostvalue_timestamp, timestamp_cmp) + result = date2timestamp_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return TimestampGetDatum(result); +} + +static Datum +cvt_timestamptz_timestamp(Datum input) +{ + TimestampTz val = DatumGetTimestampTz(input); + Timestamp result; + int overflow; + + result = timestamptz2timestamp_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return TimestampGetDatum(result); +} + +static const bool timestamp_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function timestamp_cvt_fns[] = +{NULL, cvt_date_timestamp, cvt_timestamptz_timestamp}; + +static const PGFunction timestamp_cmp_fns[] = +{timestamp_cmp, date_cmp_timestamp, timestamptz_cmp_timestamp}; + +GIN_SUPPORT(timestamp, leftmostvalue_timestamp, timestamp_rhs_is_varlena, timestamp_cvt_fns, timestamp_cmp_fns) + +static Datum +cvt_date_timestamptz(Datum input) +{ + DateADT val = DatumGetDateADT(input); + TimestampTz result; + int overflow; + + result = date2timestamptz_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return TimestampTzGetDatum(result); +} + +static Datum +cvt_timestamp_timestamptz(Datum input) +{ + Timestamp val = DatumGetTimestamp(input); + TimestampTz result; + int overflow; + + result = timestamp2timestamptz_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return TimestampTzGetDatum(result); +} + +static const bool timestamptz_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function timestamptz_cvt_fns[] = +{NULL, cvt_date_timestamptz, cvt_timestamp_timestamptz}; + +static const PGFunction timestamptz_cmp_fns[] = +{timestamp_cmp, date_cmp_timestamptz, timestamp_cmp_timestamptz}; + +GIN_SUPPORT(timestamptz, leftmostvalue_timestamp, timestamptz_rhs_is_varlena, timestamptz_cvt_fns, timestamptz_cmp_fns) static Datum leftmostvalue_time(void) @@ -278,7 +567,13 @@ leftmostvalue_time(void) return TimeADTGetDatum(0); } -GIN_SUPPORT(time, false, leftmostvalue_time, time_cmp) +static const bool time_rhs_is_varlena[] = +{false}; + +static const PGFunction time_cmp_fns[] = +{time_cmp}; + +GIN_SUPPORT(time, leftmostvalue_time, time_rhs_is_varlena, NULL, time_cmp_fns) static Datum leftmostvalue_timetz(void) @@ -291,7 +586,13 @@ leftmostvalue_timetz(void) return TimeTzADTPGetDatum(v); } -GIN_SUPPORT(timetz, false, leftmostvalue_timetz, timetz_cmp) +static const bool timetz_rhs_is_varlena[] = +{false}; + +static const PGFunction timetz_cmp_fns[] = +{timetz_cmp}; + +GIN_SUPPORT(timetz, leftmostvalue_timetz, timetz_rhs_is_varlena, NULL, timetz_cmp_fns) static Datum leftmostvalue_date(void) @@ -299,7 +600,40 @@ leftmostvalue_date(void) return DateADTGetDatum(DATEVAL_NOBEGIN); } -GIN_SUPPORT(date, false, leftmostvalue_date, date_cmp) +static Datum +cvt_timestamp_date(Datum input) +{ + Timestamp val = DatumGetTimestamp(input); + DateADT result; + int overflow; + + result = timestamp2date_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return DateADTGetDatum(result); +} + +static Datum +cvt_timestamptz_date(Datum input) +{ + TimestampTz val = DatumGetTimestampTz(input); + DateADT result; + int overflow; + + result = timestamptz2date_opt_overflow(val, &overflow); + /* We can ignore the overflow result, since result is useful as-is */ + return DateADTGetDatum(result); +} + +static const bool date_rhs_is_varlena[] = +{false, false, false}; + +static const btree_gin_convert_function date_cvt_fns[] = +{NULL, cvt_timestamp_date, cvt_timestamptz_date}; + +static const PGFunction date_cmp_fns[] = +{date_cmp, timestamp_cmp_date, timestamptz_cmp_date}; + +GIN_SUPPORT(date, leftmostvalue_date, date_rhs_is_varlena, date_cvt_fns, date_cmp_fns) static Datum leftmostvalue_interval(void) @@ -311,7 +645,13 @@ leftmostvalue_interval(void) return IntervalPGetDatum(v); } -GIN_SUPPORT(interval, false, leftmostvalue_interval, interval_cmp) +static const bool interval_rhs_is_varlena[] = +{false}; + +static const PGFunction interval_cmp_fns[] = +{interval_cmp}; + +GIN_SUPPORT(interval, leftmostvalue_interval, interval_rhs_is_varlena, NULL, interval_cmp_fns) static Datum leftmostvalue_macaddr(void) @@ -321,7 +661,13 @@ leftmostvalue_macaddr(void) return MacaddrPGetDatum(v); } -GIN_SUPPORT(macaddr, false, leftmostvalue_macaddr, macaddr_cmp) +static const bool macaddr_rhs_is_varlena[] = +{false}; + +static const PGFunction macaddr_cmp_fns[] = +{macaddr_cmp}; + +GIN_SUPPORT(macaddr, leftmostvalue_macaddr, macaddr_rhs_is_varlena, NULL, macaddr_cmp_fns) static Datum leftmostvalue_macaddr8(void) @@ -331,7 +677,13 @@ leftmostvalue_macaddr8(void) return Macaddr8PGetDatum(v); } -GIN_SUPPORT(macaddr8, false, leftmostvalue_macaddr8, macaddr8_cmp) +static const bool macaddr8_rhs_is_varlena[] = +{false}; + +static const PGFunction macaddr8_cmp_fns[] = +{macaddr8_cmp}; + +GIN_SUPPORT(macaddr8, leftmostvalue_macaddr8, macaddr8_rhs_is_varlena, NULL, macaddr8_cmp_fns) static Datum leftmostvalue_inet(void) @@ -339,9 +691,21 @@ leftmostvalue_inet(void) return DirectFunctionCall1(inet_in, CStringGetDatum("0.0.0.0/0")); } -GIN_SUPPORT(inet, true, leftmostvalue_inet, network_cmp) +static const bool inet_rhs_is_varlena[] = +{true}; + +static const PGFunction inet_cmp_fns[] = +{network_cmp}; + +GIN_SUPPORT(inet, leftmostvalue_inet, inet_rhs_is_varlena, NULL, inet_cmp_fns) -GIN_SUPPORT(cidr, true, leftmostvalue_inet, network_cmp) +static const bool cidr_rhs_is_varlena[] = +{true}; + +static const PGFunction cidr_cmp_fns[] = +{network_cmp}; + +GIN_SUPPORT(cidr, leftmostvalue_inet, cidr_rhs_is_varlena, NULL, cidr_cmp_fns) static Datum leftmostvalue_text(void) @@ -349,9 +713,32 @@ leftmostvalue_text(void) return PointerGetDatum(cstring_to_text_with_len("", 0)); } -GIN_SUPPORT(text, true, leftmostvalue_text, bttextcmp) +static Datum +cvt_name_text(Datum input) +{ + Name val = DatumGetName(input); + + return PointerGetDatum(cstring_to_text(NameStr(*val))); +} -GIN_SUPPORT(bpchar, true, leftmostvalue_text, bpcharcmp) +static const bool text_rhs_is_varlena[] = +{true, false}; + +static const btree_gin_convert_function text_cvt_fns[] = +{NULL, cvt_name_text}; + +static const PGFunction text_cmp_fns[] = +{bttextcmp, btnametextcmp}; + +GIN_SUPPORT(text, leftmostvalue_text, text_rhs_is_varlena, text_cvt_fns, text_cmp_fns) + +static const bool bpchar_rhs_is_varlena[] = +{true}; + +static const PGFunction bpchar_cmp_fns[] = +{bpcharcmp}; + +GIN_SUPPORT(bpchar, leftmostvalue_text, bpchar_rhs_is_varlena, NULL, bpchar_cmp_fns) static Datum leftmostvalue_char(void) @@ -359,9 +746,21 @@ leftmostvalue_char(void) return CharGetDatum(0); } -GIN_SUPPORT(char, false, leftmostvalue_char, btcharcmp) +static const bool char_rhs_is_varlena[] = +{false}; -GIN_SUPPORT(bytea, true, leftmostvalue_text, byteacmp) +static const PGFunction char_cmp_fns[] = +{btcharcmp}; + +GIN_SUPPORT(char, leftmostvalue_char, char_rhs_is_varlena, NULL, char_cmp_fns) + +static const bool bytea_rhs_is_varlena[] = +{true}; + +static const PGFunction bytea_cmp_fns[] = +{byteacmp}; + +GIN_SUPPORT(bytea, leftmostvalue_text, bytea_rhs_is_varlena, NULL, bytea_cmp_fns) static Datum leftmostvalue_bit(void) @@ -372,7 +771,13 @@ leftmostvalue_bit(void) Int32GetDatum(-1)); } -GIN_SUPPORT(bit, true, leftmostvalue_bit, bitcmp) +static const bool bit_rhs_is_varlena[] = +{true}; + +static const PGFunction bit_cmp_fns[] = +{bitcmp}; + +GIN_SUPPORT(bit, leftmostvalue_bit, bit_rhs_is_varlena, NULL, bit_cmp_fns) static Datum leftmostvalue_varbit(void) @@ -383,7 +788,13 @@ leftmostvalue_varbit(void) Int32GetDatum(-1)); } -GIN_SUPPORT(varbit, true, leftmostvalue_varbit, bitcmp) +static const bool varbit_rhs_is_varlena[] = +{true}; + +static const PGFunction varbit_cmp_fns[] = +{bitcmp}; + +GIN_SUPPORT(varbit, leftmostvalue_varbit, varbit_rhs_is_varlena, NULL, varbit_cmp_fns) /* * Numeric type hasn't a real left-most value, so we use PointerGetDatum(NULL) @@ -428,7 +839,13 @@ leftmostvalue_numeric(void) return PointerGetDatum(NULL); } -GIN_SUPPORT(numeric, true, leftmostvalue_numeric, gin_numeric_cmp) +static const bool numeric_rhs_is_varlena[] = +{true}; + +static const PGFunction numeric_cmp_fns[] = +{gin_numeric_cmp}; + +GIN_SUPPORT(numeric, leftmostvalue_numeric, numeric_rhs_is_varlena, NULL, numeric_cmp_fns) /* * Use a similar trick to that used for numeric for enums, since we don't @@ -477,7 +894,13 @@ leftmostvalue_enum(void) return ObjectIdGetDatum(InvalidOid); } -GIN_SUPPORT(anyenum, false, leftmostvalue_enum, gin_enum_cmp) +static const bool enum_rhs_is_varlena[] = +{false}; + +static const PGFunction enum_cmp_fns[] = +{gin_enum_cmp}; + +GIN_SUPPORT(anyenum, leftmostvalue_enum, enum_rhs_is_varlena, NULL, enum_cmp_fns) static Datum leftmostvalue_uuid(void) @@ -491,7 +914,13 @@ leftmostvalue_uuid(void) return UUIDPGetDatum(retval); } -GIN_SUPPORT(uuid, false, leftmostvalue_uuid, uuid_cmp) +static const bool uuid_rhs_is_varlena[] = +{false}; + +static const PGFunction uuid_cmp_fns[] = +{uuid_cmp}; + +GIN_SUPPORT(uuid, leftmostvalue_uuid, uuid_rhs_is_varlena, NULL, uuid_cmp_fns) static Datum leftmostvalue_name(void) @@ -501,7 +930,37 @@ leftmostvalue_name(void) return NameGetDatum(result); } -GIN_SUPPORT(name, false, leftmostvalue_name, btnamecmp) +static Datum +cvt_text_name(Datum input) +{ + text *val = DatumGetTextPP(input); + NameData *result = (NameData *) palloc0(NAMEDATALEN); + int len = VARSIZE_ANY_EXHDR(val); + + /* + * Truncate oversize input. We're assuming this will produce a result + * considered less than the original. That could be a bad assumption in + * some collations, but fortunately an index on "name" is generally going + * to use C collation. + */ + if (len >= NAMEDATALEN) + len = pg_mbcliplen(VARDATA_ANY(val), len, NAMEDATALEN - 1); + + memcpy(NameStr(*result), VARDATA_ANY(val), len); + + return NameGetDatum(result); +} + +static const bool name_rhs_is_varlena[] = +{false, true}; + +static const btree_gin_convert_function name_cvt_fns[] = +{NULL, cvt_text_name}; + +static const PGFunction name_cmp_fns[] = +{btnamecmp, bttextnamecmp}; + +GIN_SUPPORT(name, leftmostvalue_name, name_rhs_is_varlena, name_cvt_fns, name_cmp_fns) static Datum leftmostvalue_bool(void) @@ -509,4 +968,10 @@ leftmostvalue_bool(void) return BoolGetDatum(false); } -GIN_SUPPORT(bool, false, leftmostvalue_bool, btboolcmp) +static const bool bool_rhs_is_varlena[] = +{false}; + +static const PGFunction bool_cmp_fns[] = +{btboolcmp}; + +GIN_SUPPORT(bool, leftmostvalue_bool, bool_rhs_is_varlena, NULL, bool_cmp_fns) diff --git a/contrib/btree_gin/btree_gin.control b/contrib/btree_gin/btree_gin.control index 67d0c997d8d26..0c77c81727117 100644 --- a/contrib/btree_gin/btree_gin.control +++ b/contrib/btree_gin/btree_gin.control @@ -1,6 +1,6 @@ # btree_gin extension comment = 'support for indexing common datatypes in GIN' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/btree_gin' relocatable = true trusted = true diff --git a/contrib/btree_gin/expected/date.out b/contrib/btree_gin/expected/date.out index 40dfa308cf753..e69c1da2000f2 100644 --- a/contrib/btree_gin/expected/date.out +++ b/contrib/btree_gin/expected/date.out @@ -49,3 +49,365 @@ SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; 10-28-2004 (2 rows) +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_date + Recheck Cond: (i < 'Tue Oct 26 00:00:00 2004'::timestamp without time zone) + -> Bitmap Index Scan on idx_date + Index Cond: (i < 'Tue Oct 26 00:00:00 2004'::timestamp without time zone) +(6 rows) + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 +(3 rows) + +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 +(4 rows) + +SELECT * FROM test_date WHERE i='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-26-2004 +(1 row) + +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 +(3 rows) + +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamp ORDER BY i; + i +------------ + 10-27-2004 + 10-28-2004 +(2 rows) + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Bitmap Heap Scan on test_date + Recheck Cond: (i < 'Tue Oct 26 00:00:00 2004 PDT'::timestamp with time zone) + -> Bitmap Index Scan on idx_date + Index Cond: (i < 'Tue Oct 26 00:00:00 2004 PDT'::timestamp with time zone) +(6 rows) + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 +(3 rows) + +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 +(4 rows) + +SELECT * FROM test_date WHERE i='2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 +(1 row) + +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 +(3 rows) + +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamptz ORDER BY i; + i +------------ + 10-27-2004 + 10-28-2004 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_date VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_date'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_date WHERE i<'-infinity'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<='-infinity'::timestamp ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i='-infinity'::timestamp ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i>='-infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i>'-infinity'::timestamp ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(7 rows) + +SELECT * FROM test_date WHERE i<'infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 +(7 rows) + +SELECT * FROM test_date WHERE i<='infinity'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i='infinity'::timestamp ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>='infinity'::timestamp ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>'infinity'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<'-infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i<='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_date WHERE i>='-infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i>'-infinity'::timestamptz ORDER BY i; + i +------------ + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(7 rows) + +SELECT * FROM test_date WHERE i<'infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 +(7 rows) + +SELECT * FROM test_date WHERE i<='infinity'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(8 rows) + +SELECT * FROM test_date WHERE i='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_date WHERE i>'infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +-- Check rounding cases +-- '2004-10-25 00:00:01' rounds to '2004-10-25' for date. +-- '2004-10-25 23:59:59' also rounds to '2004-10-25', +-- so it's the same case as '2004-10-25 00:00:01' +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamp ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + -infinity + 10-23-2004 + 10-24-2004 + 10-25-2004 +(4 rows) + +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamptz ORDER BY i; + i +------------ + 10-26-2004 + 10-27-2004 + 10-28-2004 + infinity +(4 rows) + diff --git a/contrib/btree_gin/expected/float4.out b/contrib/btree_gin/expected/float4.out index 7b9134fcd4bdc..c8bb04e59be9b 100644 --- a/contrib/btree_gin/expected/float4.out +++ b/contrib/btree_gin/expected/float4.out @@ -42,3 +42,324 @@ SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_float4 + Recheck Cond: (i < '1'::double precision) + -> Bitmap Index Scan on idx_float4 + Index Cond: (i < '1'::double precision) +(6 rows) + +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_float4 WHERE i<=1::float8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float4 WHERE i=1::float8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float4 WHERE i>=1::float8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float4 WHERE i>1::float8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_float4 VALUES ('NaN'), ('Inf'), ('-Inf'); +SELECT gin_clean_pending_list('idx_float4'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_float4 WHERE i<'-Inf'::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i<='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i>='-Inf'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(9 rows) + +SELECT * FROM test_float4 WHERE i>'-Inf'::float8 ORDER BY i; + i +---------- + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(8 rows) + +SELECT * FROM test_float4 WHERE i<'Inf'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i<='Inf'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity +(8 rows) + +SELECT * FROM test_float4 WHERE i='Inf'::float8 ORDER BY i; + i +---------- + Infinity +(1 row) + +SELECT * FROM test_float4 WHERE i>='Inf'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i>'Inf'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i<'1e300'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i<='1e300'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 +(7 rows) + +SELECT * FROM test_float4 WHERE i='1e300'::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i>='1e300'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i>'1e300'::float8 ORDER BY i; + i +---------- + Infinity + NaN +(2 rows) + +SELECT * FROM test_float4 WHERE i<'NaN'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity +(8 rows) + +SELECT * FROM test_float4 WHERE i<='NaN'::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 + 1 + 2 + 3 + Infinity + NaN +(9 rows) + +SELECT * FROM test_float4 WHERE i='NaN'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i>='NaN'::float8 ORDER BY i; + i +----- + NaN +(1 row) + +SELECT * FROM test_float4 WHERE i>'NaN'::float8 ORDER BY i; + i +--- +(0 rows) + +-- Check rounding cases +-- 1e-300 rounds to 0 for float4 but not for float8 +SELECT * FROM test_float4 WHERE i < -1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 +(3 rows) + +SELECT * FROM test_float4 WHERE i <= -1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 +(3 rows) + +SELECT * FROM test_float4 WHERE i = -1e-300::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i > -1e-300::float8 ORDER BY i; + i +---------- + 0 + 1 + 2 + 3 + Infinity + NaN +(6 rows) + +SELECT * FROM test_float4 WHERE i >= -1e-300::float8 ORDER BY i; + i +---------- + 0 + 1 + 2 + 3 + Infinity + NaN +(6 rows) + +SELECT * FROM test_float4 WHERE i < 1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 +(4 rows) + +SELECT * FROM test_float4 WHERE i <= 1e-300::float8 ORDER BY i; + i +----------- + -Infinity + -2 + -1 + 0 +(4 rows) + +SELECT * FROM test_float4 WHERE i = 1e-300::float8 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_float4 WHERE i > 1e-300::float8 ORDER BY i; + i +---------- + 1 + 2 + 3 + Infinity + NaN +(5 rows) + +SELECT * FROM test_float4 WHERE i >= 1e-300::float8 ORDER BY i; + i +---------- + 1 + 2 + 3 + Infinity + NaN +(5 rows) + diff --git a/contrib/btree_gin/expected/float8.out b/contrib/btree_gin/expected/float8.out index a41d4f9f6bb05..b2877dfa3c1c2 100644 --- a/contrib/btree_gin/expected/float8.out +++ b/contrib/btree_gin/expected/float8.out @@ -42,3 +42,53 @@ SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_float8 + Recheck Cond: (i < '1'::real) + -> Bitmap Index Scan on idx_float8 + Index Cond: (i < '1'::real) +(6 rows) + +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_float8 WHERE i<=1::float4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_float8 WHERE i=1::float4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_float8 WHERE i>=1::float4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_float8 WHERE i>1::float4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/int2.out b/contrib/btree_gin/expected/int2.out index 20d66a1b05545..bcfa68f671a25 100644 --- a/contrib/btree_gin/expected/int2.out +++ b/contrib/btree_gin/expected/int2.out @@ -42,3 +42,193 @@ SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + QUERY PLAN +------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int2 + Recheck Cond: (i < 1) + -> Bitmap Index Scan on idx_int2 + Index Cond: (i < 1) +(6 rows) + +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int2 WHERE i<=1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int2 WHERE i=1::int4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i>=1::int4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int2 WHERE i>1::int4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int2 + Recheck Cond: (i < '1'::bigint) + -> Bitmap Index Scan on idx_int2 + Index Cond: (i < '1'::bigint) +(6 rows) + +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int2 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int2 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int2 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_int2 VALUES ((-32768)::int2),(32767); +SELECT gin_clean_pending_list('idx_int2'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_int2 WHERE i<(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i<=(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i=(-32769)::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>=(-32769)::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i>(-32769)::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i<32768::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i<=32768::int4 ORDER BY i; + i +-------- + -32768 + -2 + -1 + 0 + 1 + 2 + 3 + 32767 +(8 rows) + +SELECT * FROM test_int2 WHERE i=32768::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>=32768::int4 ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_int2 WHERE i>32768::int4 ORDER BY i; + i +--- +(0 rows) + diff --git a/contrib/btree_gin/expected/int4.out b/contrib/btree_gin/expected/int4.out index 0f0122c6f5e03..e62791e18bdc2 100644 --- a/contrib/btree_gin/expected/int4.out +++ b/contrib/btree_gin/expected/int4.out @@ -42,3 +42,103 @@ SELECT * FROM test_int4 WHERE i>1::int4 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int4 + Recheck Cond: (i < '1'::smallint) + -> Bitmap Index Scan on idx_int4 + Index Cond: (i < '1'::smallint) +(6 rows) + +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int4 WHERE i<=1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int4 WHERE i=1::int2 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int4 WHERE i>=1::int2 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int4 WHERE i>1::int2 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int4 + Recheck Cond: (i < '1'::bigint) + -> Bitmap Index Scan on idx_int4 + Index Cond: (i < '1'::bigint) +(6 rows) + +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int4 WHERE i<=1::int8 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int4 WHERE i=1::int8 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int4 WHERE i>=1::int8 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int4 WHERE i>1::int8 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/int8.out b/contrib/btree_gin/expected/int8.out index 307e19e7a056d..c9aceb9d357c6 100644 --- a/contrib/btree_gin/expected/int8.out +++ b/contrib/btree_gin/expected/int8.out @@ -42,3 +42,103 @@ SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; 3 (2 rows) +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int8 + Recheck Cond: (i < '1'::smallint) + -> Bitmap Index Scan on idx_int8 + Index Cond: (i < '1'::smallint) +(6 rows) + +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int2 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int2 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int2 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int2 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + QUERY PLAN +------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_int8 + Recheck Cond: (i < 1) + -> Bitmap Index Scan on idx_int8 + Index Cond: (i < 1) +(6 rows) + +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 +(3 rows) + +SELECT * FROM test_int8 WHERE i<=1::int4 ORDER BY i; + i +---- + -2 + -1 + 0 + 1 +(4 rows) + +SELECT * FROM test_int8 WHERE i=1::int4 ORDER BY i; + i +--- + 1 +(1 row) + +SELECT * FROM test_int8 WHERE i>=1::int4 ORDER BY i; + i +--- + 1 + 2 + 3 +(3 rows) + +SELECT * FROM test_int8 WHERE i>1::int4 ORDER BY i; + i +--- + 2 + 3 +(2 rows) + diff --git a/contrib/btree_gin/expected/name.out b/contrib/btree_gin/expected/name.out index 174de6576f0f0..3a30f62519c67 100644 --- a/contrib/btree_gin/expected/name.out +++ b/contrib/btree_gin/expected/name.out @@ -95,3 +95,62 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>'abc' ORDER BY i; Index Cond: (i > 'abc'::name) (6 rows) +explain (costs off) +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + QUERY PLAN +--------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_name + Recheck Cond: (i < 'abc'::text) + -> Bitmap Index Scan on idx_name + Index Cond: (i < 'abc'::text) +(6 rows) + +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_name WHERE i<='abc'::text ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_name WHERE i='abc'::text ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_name WHERE i>='abc'::text ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_name WHERE i>'abc'::text ORDER BY i; + i +----- + axy + xyz +(2 rows) + +SELECT * FROM test_name WHERE i<=repeat('abc', 100) ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + diff --git a/contrib/btree_gin/expected/text.out b/contrib/btree_gin/expected/text.out index 3e31ad744d6aa..7f52f3db7b38e 100644 --- a/contrib/btree_gin/expected/text.out +++ b/contrib/btree_gin/expected/text.out @@ -42,3 +42,53 @@ SELECT * FROM test_text WHERE i>'abc' ORDER BY i; xyz (2 rows) +explain (costs off) +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + QUERY PLAN +--------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_text + Recheck Cond: (i < 'abc'::name COLLATE "default") + -> Bitmap Index Scan on idx_text + Index Cond: (i < 'abc'::name COLLATE "default") +(6 rows) + +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + i +----- + a + ab + abb +(3 rows) + +SELECT * FROM test_text WHERE i<='abc'::name COLLATE "default" ORDER BY i; + i +----- + a + ab + abb + abc +(4 rows) + +SELECT * FROM test_text WHERE i='abc'::name COLLATE "default" ORDER BY i; + i +----- + abc +(1 row) + +SELECT * FROM test_text WHERE i>='abc'::name COLLATE "default" ORDER BY i; + i +----- + abc + axy + xyz +(3 rows) + +SELECT * FROM test_text WHERE i>'abc'::name COLLATE "default" ORDER BY i; + i +----- + axy + xyz +(2 rows) + diff --git a/contrib/btree_gin/expected/timestamp.out b/contrib/btree_gin/expected/timestamp.out index a236cdc94a9d2..b7565285e68ba 100644 --- a/contrib/btree_gin/expected/timestamp.out +++ b/contrib/btree_gin/expected/timestamp.out @@ -7,8 +7,8 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamp ON test_timestamp USING gin (i); SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; @@ -38,14 +38,308 @@ SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i -------------------------- Tue Oct 26 08:55:08 2004 - Tue Oct 26 09:55:08 2004 - Tue Oct 26 10:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 (3 rows) SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; i -------------------------- - Tue Oct 26 09:55:08 2004 - Tue Oct 26 10:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 (2 rows) +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + QUERY PLAN +---------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamp + Recheck Cond: (i < '10-27-2004'::date) + -> Bitmap Index Scan on idx_timestamp + Index Cond: (i < '10-27-2004'::date) +(6 rows) + +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i<='2004-10-27'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i='2004-10-27'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i>='2004-10-27'::date ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +SELECT * FROM test_timestamp WHERE i>'2004-10-27'::date ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamp + Recheck Cond: (i < 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) + -> Bitmap Index Scan on idx_timestamp + Index Cond: (i < 'Tue Oct 26 08:55:08 2004 PDT'::timestamp with time zone) +(6 rows) + +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 +(3 rows) + +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 +(4 rows) + +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 +(1 row) + +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(3 rows) + +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + i +-------------------------- + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(2 rows) + +-- Check endpoint and out-of-range cases +INSERT INTO test_timestamp VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_timestamp'); + gin_clean_pending_list +------------------------ + 1 +(1 row) + +SELECT * FROM test_timestamp WHERE i<'-infinity'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<='-infinity'::date ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i='-infinity'::date ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='-infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i>'-infinity'::date ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + +SELECT * FROM test_timestamp WHERE i<'infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(7 rows) + +SELECT * FROM test_timestamp WHERE i<='infinity'::date ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i='infinity'::date ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='infinity'::date ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'infinity'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<'-infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamp WHERE i<='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i='-infinity'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='-infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i>'-infinity'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + +SELECT * FROM test_timestamp WHERE i<'infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 +(7 rows) + +SELECT * FROM test_timestamp WHERE i<='infinity'::timestamptz ORDER BY i; + i +-------------------------- + -infinity + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(8 rows) + +SELECT * FROM test_timestamp WHERE i='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>='infinity'::timestamptz ORDER BY i; + i +---------- + infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'infinity'::timestamptz ORDER BY i; + i +--- +(0 rows) + +-- This PST timestamptz will underflow if converted to timestamp +SELECT * FROM test_timestamp WHERE i<='4714-11-23 17:00 BC'::timestamptz ORDER BY i; + i +----------- + -infinity +(1 row) + +SELECT * FROM test_timestamp WHERE i>'4714-11-23 17:00 BC'::timestamptz ORDER BY i; + i +-------------------------- + Tue Oct 26 03:55:08 2004 + Tue Oct 26 04:55:08 2004 + Tue Oct 26 05:55:08 2004 + Tue Oct 26 08:55:08 2004 + Wed Oct 27 09:55:08 2004 + Wed Oct 27 10:55:08 2004 + infinity +(7 rows) + diff --git a/contrib/btree_gin/expected/timestamptz.out b/contrib/btree_gin/expected/timestamptz.out index d53963d2a04b8..0dada0b662cbb 100644 --- a/contrib/btree_gin/expected/timestamptz.out +++ b/contrib/btree_gin/expected/timestamptz.out @@ -7,8 +7,8 @@ INSERT INTO test_timestamptz VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamptz ON test_timestamptz USING gin (i); SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; @@ -38,14 +38,113 @@ SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER i ------------------------------ Tue Oct 26 08:55:08 2004 PDT - Tue Oct 26 09:55:08 2004 PDT - Tue Oct 26 10:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT (3 rows) SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; i ------------------------------ - Tue Oct 26 09:55:08 2004 PDT - Tue Oct 26 10:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + QUERY PLAN +---------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamptz + Recheck Cond: (i < '10-27-2004'::date) + -> Bitmap Index Scan on idx_timestamptz + Index Cond: (i < '10-27-2004'::date) +(6 rows) + +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i<='2004-10-27'::date ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i='2004-10-27'::date ORDER BY i; + i +--- +(0 rows) + +SELECT * FROM test_timestamptz WHERE i>='2004-10-27'::date ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +SELECT * FROM test_timestamptz WHERE i>'2004-10-27'::date ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(2 rows) + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: i + -> Bitmap Heap Scan on test_timestamptz + Recheck Cond: (i < 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) + -> Bitmap Index Scan on idx_timestamptz + Index Cond: (i < 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) +(6 rows) + +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT +(3 rows) + +SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 03:55:08 2004 PDT + Tue Oct 26 04:55:08 2004 PDT + Tue Oct 26 05:55:08 2004 PDT + Tue Oct 26 08:55:08 2004 PDT +(4 rows) + +SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 08:55:08 2004 PDT +(1 row) + +SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Tue Oct 26 08:55:08 2004 PDT + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT +(3 rows) + +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; + i +------------------------------ + Wed Oct 27 09:55:08 2004 PDT + Wed Oct 27 10:55:08 2004 PDT (2 rows) diff --git a/contrib/btree_gin/meson.build b/contrib/btree_gin/meson.build index b2749f6e66951..ece0a716973ce 100644 --- a/contrib/btree_gin/meson.build +++ b/contrib/btree_gin/meson.build @@ -22,6 +22,7 @@ install_data( 'btree_gin--1.0--1.1.sql', 'btree_gin--1.1--1.2.sql', 'btree_gin--1.2--1.3.sql', + 'btree_gin--1.3--1.4.sql', kwargs: contrib_data_args, ) diff --git a/contrib/btree_gin/sql/date.sql b/contrib/btree_gin/sql/date.sql index 35086f6b81b9b..006f6f528b835 100644 --- a/contrib/btree_gin/sql/date.sql +++ b/contrib/btree_gin/sql/date.sql @@ -20,3 +20,67 @@ SELECT * FROM test_date WHERE i<='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i>='2004-10-26'::date ORDER BY i; SELECT * FROM test_date WHERE i>'2004-10-26'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamp ORDER BY i; + +explain (costs off) +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; + +SELECT * FROM test_date WHERE i<'2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='2004-10-26'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'2004-10-26'::timestamptz ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_date VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_date'); + +SELECT * FROM test_date WHERE i<'-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='-infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'-infinity'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i<='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>='infinity'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i>'infinity'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i<'-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'-infinity'::timestamptz ORDER BY i; + +SELECT * FROM test_date WHERE i<'infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i<='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i>'infinity'::timestamptz ORDER BY i; + +-- Check rounding cases +-- '2004-10-25 00:00:01' rounds to '2004-10-25' for date. +-- '2004-10-25 23:59:59' also rounds to '2004-10-25', +-- so it's the same case as '2004-10-25 00:00:01' + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamp ORDER BY i; +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamp ORDER BY i; + +SELECT * FROM test_date WHERE i < '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i <= '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i = '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i > '2004-10-25 00:00:01'::timestamptz ORDER BY i; +SELECT * FROM test_date WHERE i >= '2004-10-25 00:00:01'::timestamptz ORDER BY i; diff --git a/contrib/btree_gin/sql/float4.sql b/contrib/btree_gin/sql/float4.sql index 759778ad3c3b4..0707ed6518fa2 100644 --- a/contrib/btree_gin/sql/float4.sql +++ b/contrib/btree_gin/sql/float4.sql @@ -13,3 +13,56 @@ SELECT * FROM test_float4 WHERE i<=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i>=1::float4 ORDER BY i; SELECT * FROM test_float4 WHERE i>1::float4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>=1::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>1::float8 ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_float4 VALUES ('NaN'), ('Inf'), ('-Inf'); +SELECT gin_clean_pending_list('idx_float4'); + +SELECT * FROM test_float4 WHERE i<'-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='-Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'-Inf'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='Inf'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'Inf'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='1e300'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'1e300'::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i<'NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i<='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>='NaN'::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i>'NaN'::float8 ORDER BY i; + +-- Check rounding cases +-- 1e-300 rounds to 0 for float4 but not for float8 + +SELECT * FROM test_float4 WHERE i < -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i <= -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i = -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i > -1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i >= -1e-300::float8 ORDER BY i; + +SELECT * FROM test_float4 WHERE i < 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i <= 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i = 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i > 1e-300::float8 ORDER BY i; +SELECT * FROM test_float4 WHERE i >= 1e-300::float8 ORDER BY i; diff --git a/contrib/btree_gin/sql/float8.sql b/contrib/btree_gin/sql/float8.sql index b046ac4e6c4bb..5f393147082b1 100644 --- a/contrib/btree_gin/sql/float8.sql +++ b/contrib/btree_gin/sql/float8.sql @@ -13,3 +13,12 @@ SELECT * FROM test_float8 WHERE i<=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i>=1::float8 ORDER BY i; SELECT * FROM test_float8 WHERE i>1::float8 ORDER BY i; + +explain (costs off) +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; + +SELECT * FROM test_float8 WHERE i<1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i<=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i>=1::float4 ORDER BY i; +SELECT * FROM test_float8 WHERE i>1::float4 ORDER BY i; diff --git a/contrib/btree_gin/sql/int2.sql b/contrib/btree_gin/sql/int2.sql index f06f11702f54e..959e0f6cfde01 100644 --- a/contrib/btree_gin/sql/int2.sql +++ b/contrib/btree_gin/sql/int2.sql @@ -13,3 +13,38 @@ SELECT * FROM test_int2 WHERE i<=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i>=1::int2 ORDER BY i; SELECT * FROM test_int2 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=1::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>1::int4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=1::int8 ORDER BY i; +SELECT * FROM test_int2 WHERE i>1::int8 ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_int2 VALUES ((-32768)::int2),(32767); +SELECT gin_clean_pending_list('idx_int2'); + +SELECT * FROM test_int2 WHERE i<(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=(-32769)::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>(-32769)::int4 ORDER BY i; + +SELECT * FROM test_int2 WHERE i<32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i<=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>=32768::int4 ORDER BY i; +SELECT * FROM test_int2 WHERE i>32768::int4 ORDER BY i; diff --git a/contrib/btree_gin/sql/int4.sql b/contrib/btree_gin/sql/int4.sql index 6499c29630722..9a45530b63ad7 100644 --- a/contrib/btree_gin/sql/int4.sql +++ b/contrib/btree_gin/sql/int4.sql @@ -13,3 +13,21 @@ SELECT * FROM test_int4 WHERE i<=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i>=1::int4 ORDER BY i; SELECT * FROM test_int4 WHERE i>1::int4 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; + +SELECT * FROM test_int4 WHERE i<1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i<=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i>=1::int2 ORDER BY i; +SELECT * FROM test_int4 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; + +SELECT * FROM test_int4 WHERE i<1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i<=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i>=1::int8 ORDER BY i; +SELECT * FROM test_int4 WHERE i>1::int8 ORDER BY i; diff --git a/contrib/btree_gin/sql/int8.sql b/contrib/btree_gin/sql/int8.sql index 4d9c2871814c4..b31f27c69b90a 100644 --- a/contrib/btree_gin/sql/int8.sql +++ b/contrib/btree_gin/sql/int8.sql @@ -13,3 +13,21 @@ SELECT * FROM test_int8 WHERE i<=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i>=1::int8 ORDER BY i; SELECT * FROM test_int8 WHERE i>1::int8 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; + +SELECT * FROM test_int8 WHERE i<1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i<=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i>=1::int2 ORDER BY i; +SELECT * FROM test_int8 WHERE i>1::int2 ORDER BY i; + +explain (costs off) +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; + +SELECT * FROM test_int8 WHERE i<1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i<=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i>=1::int4 ORDER BY i; +SELECT * FROM test_int8 WHERE i>1::int4 ORDER BY i; diff --git a/contrib/btree_gin/sql/name.sql b/contrib/btree_gin/sql/name.sql index c11580cdf9609..551d928940746 100644 --- a/contrib/btree_gin/sql/name.sql +++ b/contrib/btree_gin/sql/name.sql @@ -19,3 +19,14 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i<='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>='abc' ORDER BY i; EXPLAIN (COSTS OFF) SELECT * FROM test_name WHERE i>'abc' ORDER BY i; + +explain (costs off) +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; + +SELECT * FROM test_name WHERE i<'abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i<='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i>='abc'::text ORDER BY i; +SELECT * FROM test_name WHERE i>'abc'::text ORDER BY i; + +SELECT * FROM test_name WHERE i<=repeat('abc', 100) ORDER BY i; diff --git a/contrib/btree_gin/sql/text.sql b/contrib/btree_gin/sql/text.sql index d5b3b39898988..978b21376fd85 100644 --- a/contrib/btree_gin/sql/text.sql +++ b/contrib/btree_gin/sql/text.sql @@ -13,3 +13,12 @@ SELECT * FROM test_text WHERE i<='abc' ORDER BY i; SELECT * FROM test_text WHERE i='abc' ORDER BY i; SELECT * FROM test_text WHERE i>='abc' ORDER BY i; SELECT * FROM test_text WHERE i>'abc' ORDER BY i; + +explain (costs off) +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; + +SELECT * FROM test_text WHERE i<'abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i<='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i>='abc'::name COLLATE "default" ORDER BY i; +SELECT * FROM test_text WHERE i>'abc'::name COLLATE "default" ORDER BY i; diff --git a/contrib/btree_gin/sql/timestamp.sql b/contrib/btree_gin/sql/timestamp.sql index 56727e81c4aff..1ee4edb5ea4d2 100644 --- a/contrib/btree_gin/sql/timestamp.sql +++ b/contrib/btree_gin/sql/timestamp.sql @@ -9,8 +9,8 @@ INSERT INTO test_timestamp VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamp ON test_timestamp USING gin (i); @@ -20,3 +20,54 @@ SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'2004-10-27'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +-- Check endpoint and out-of-range cases + +INSERT INTO test_timestamp VALUES ('-infinity'), ('infinity'); +SELECT gin_clean_pending_list('idx_timestamp'); + +SELECT * FROM test_timestamp WHERE i<'-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='-infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'-infinity'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='infinity'::date ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'infinity'::date ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='-infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'-infinity'::timestamptz ORDER BY i; + +SELECT * FROM test_timestamp WHERE i<'infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i<='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>='infinity'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'infinity'::timestamptz ORDER BY i; + +-- This PST timestamptz will underflow if converted to timestamp +SELECT * FROM test_timestamp WHERE i<='4714-11-23 17:00 BC'::timestamptz ORDER BY i; +SELECT * FROM test_timestamp WHERE i>'4714-11-23 17:00 BC'::timestamptz ORDER BY i; diff --git a/contrib/btree_gin/sql/timestamptz.sql b/contrib/btree_gin/sql/timestamptz.sql index e6cfdb1b07447..40d2d7ed329d2 100644 --- a/contrib/btree_gin/sql/timestamptz.sql +++ b/contrib/btree_gin/sql/timestamptz.sql @@ -9,8 +9,8 @@ INSERT INTO test_timestamptz VALUES ( '2004-10-26 04:55:08' ), ( '2004-10-26 05:55:08' ), ( '2004-10-26 08:55:08' ), - ( '2004-10-26 09:55:08' ), - ( '2004-10-26 10:55:08' ) + ( '2004-10-27 09:55:08' ), + ( '2004-10-27 10:55:08' ) ; CREATE INDEX idx_timestamptz ON test_timestamptz USING gin (i); @@ -20,3 +20,21 @@ SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamptz ORDER SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamptz ORDER BY i; SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamptz ORDER BY i; SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamptz ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; + +SELECT * FROM test_timestamptz WHERE i<'2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i<='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>='2004-10-27'::date ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>'2004-10-27'::date ORDER BY i; + +explain (costs off) +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; + +SELECT * FROM test_timestamptz WHERE i<'2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i<='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>='2004-10-26 08:55:08'::timestamp ORDER BY i; +SELECT * FROM test_timestamptz WHERE i>'2004-10-26 08:55:08'::timestamp ORDER BY i; diff --git a/contrib/btree_gist/Makefile b/contrib/btree_gist/Makefile index 68190ac5e4687..7ac2df26c1044 100644 --- a/contrib/btree_gist/Makefile +++ b/contrib/btree_gist/Makefile @@ -34,7 +34,7 @@ DATA = btree_gist--1.0--1.1.sql \ btree_gist--1.1--1.2.sql btree_gist--1.2.sql btree_gist--1.2--1.3.sql \ btree_gist--1.3--1.4.sql btree_gist--1.4--1.5.sql \ btree_gist--1.5--1.6.sql btree_gist--1.6--1.7.sql \ - btree_gist--1.7--1.8.sql btree_gist--1.8--1.9.sql + btree_gist--1.7--1.8.sql PGFILEDESC = "btree_gist - B-tree equivalent GiST operator classes" REGRESS = init int2 int4 int8 float4 float8 cash oid timestamp timestamptz \ diff --git a/contrib/btree_gist/btree_gist--1.7--1.8.sql b/contrib/btree_gist/btree_gist--1.7--1.8.sql index 8f79365a461f8..22316dc3f566c 100644 --- a/contrib/btree_gist/btree_gist--1.7--1.8.sql +++ b/contrib/btree_gist/btree_gist--1.7--1.8.sql @@ -3,6 +3,203 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION btree_gist UPDATE TO '1.8'" to load this file. \quit +-- Add sortsupport functions + +CREATE FUNCTION gbt_bit_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_varbit_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bool_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bytea_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_cash_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_date_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_enum_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_float4_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_float8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_inet_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int2_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int4_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_int8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_intv_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_macaddr_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_macad8_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_numeric_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_oid_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_text_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_bpchar_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_time_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_ts_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +CREATE FUNCTION gbt_uuid_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; + +ALTER OPERATOR FAMILY gist_bit_ops USING gist ADD + FUNCTION 11 (bit, bit) gbt_bit_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_vbit_ops USING gist ADD + FUNCTION 11 (varbit, varbit) gbt_varbit_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bool_ops USING gist ADD + FUNCTION 11 (bool, bool) gbt_bool_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bytea_ops USING gist ADD + FUNCTION 11 (bytea, bytea) gbt_bytea_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_cash_ops USING gist ADD + FUNCTION 11 (money, money) gbt_cash_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_date_ops USING gist ADD + FUNCTION 11 (date, date) gbt_date_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_enum_ops USING gist ADD + FUNCTION 11 (anyenum, anyenum) gbt_enum_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_float4_ops USING gist ADD + FUNCTION 11 (float4, float4) gbt_float4_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_float8_ops USING gist ADD + FUNCTION 11 (float8, float8) gbt_float8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_inet_ops USING gist ADD + FUNCTION 11 (inet, inet) gbt_inet_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_cidr_ops USING gist ADD + FUNCTION 11 (cidr, cidr) gbt_inet_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int2_ops USING gist ADD + FUNCTION 11 (int2, int2) gbt_int2_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int4_ops USING gist ADD + FUNCTION 11 (int4, int4) gbt_int4_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_int8_ops USING gist ADD + FUNCTION 11 (int8, int8) gbt_int8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_interval_ops USING gist ADD + FUNCTION 11 (interval, interval) gbt_intv_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_macaddr_ops USING gist ADD + FUNCTION 11 (macaddr, macaddr) gbt_macaddr_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_macaddr8_ops USING gist ADD + FUNCTION 11 (macaddr8, macaddr8) gbt_macad8_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_numeric_ops USING gist ADD + FUNCTION 11 (numeric, numeric) gbt_numeric_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_oid_ops USING gist ADD + FUNCTION 11 (oid, oid) gbt_oid_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_text_ops USING gist ADD + FUNCTION 11 (text, text) gbt_text_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_bpchar_ops USING gist ADD + FUNCTION 11 (bpchar, bpchar) gbt_bpchar_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_time_ops USING gist ADD + FUNCTION 11 (time, time) gbt_time_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timetz_ops USING gist ADD + FUNCTION 11 (timetz, timetz) gbt_time_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timestamp_ops USING gist ADD + FUNCTION 11 (timestamp, timestamp) gbt_ts_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_timestamptz_ops USING gist ADD + FUNCTION 11 (timestamptz, timestamptz) gbt_ts_sortsupport (internal) ; + +ALTER OPERATOR FAMILY gist_uuid_ops USING gist ADD + FUNCTION 11 (uuid, uuid) gbt_uuid_sortsupport (internal) ; + +-- Add translate_cmptype functions + CREATE FUNCTION gist_translate_cmptype_btree(int) RETURNS smallint AS 'MODULE_PATHNAME' diff --git a/contrib/btree_gist/btree_gist--1.8--1.9.sql b/contrib/btree_gist/btree_gist--1.8--1.9.sql deleted file mode 100644 index 4b38749bf5f34..0000000000000 --- a/contrib/btree_gist/btree_gist--1.8--1.9.sql +++ /dev/null @@ -1,197 +0,0 @@ -/* contrib/btree_gist/btree_gist--1.7--1.8.sql */ - --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "ALTER EXTENSION btree_gist UPDATE TO '1.9'" to load this file. \quit - -CREATE FUNCTION gbt_bit_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_varbit_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bool_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bytea_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_cash_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_date_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_enum_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_float4_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_float8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_inet_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int2_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int4_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_int8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_intv_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_macaddr_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_macad8_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_numeric_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_oid_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_text_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_bpchar_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_time_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_ts_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -CREATE FUNCTION gbt_uuid_sortsupport(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE PARALLEL SAFE STRICT; - -ALTER OPERATOR FAMILY gist_bit_ops USING gist ADD - FUNCTION 11 (bit, bit) gbt_bit_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_vbit_ops USING gist ADD - FUNCTION 11 (varbit, varbit) gbt_varbit_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bool_ops USING gist ADD - FUNCTION 11 (bool, bool) gbt_bool_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bytea_ops USING gist ADD - FUNCTION 11 (bytea, bytea) gbt_bytea_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_cash_ops USING gist ADD - FUNCTION 11 (money, money) gbt_cash_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_date_ops USING gist ADD - FUNCTION 11 (date, date) gbt_date_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_enum_ops USING gist ADD - FUNCTION 11 (anyenum, anyenum) gbt_enum_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_float4_ops USING gist ADD - FUNCTION 11 (float4, float4) gbt_float4_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_float8_ops USING gist ADD - FUNCTION 11 (float8, float8) gbt_float8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_inet_ops USING gist ADD - FUNCTION 11 (inet, inet) gbt_inet_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_cidr_ops USING gist ADD - FUNCTION 11 (cidr, cidr) gbt_inet_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int2_ops USING gist ADD - FUNCTION 11 (int2, int2) gbt_int2_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int4_ops USING gist ADD - FUNCTION 11 (int4, int4) gbt_int4_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_int8_ops USING gist ADD - FUNCTION 11 (int8, int8) gbt_int8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_interval_ops USING gist ADD - FUNCTION 11 (interval, interval) gbt_intv_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_macaddr_ops USING gist ADD - FUNCTION 11 (macaddr, macaddr) gbt_macaddr_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_macaddr8_ops USING gist ADD - FUNCTION 11 (macaddr8, macaddr8) gbt_macad8_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_numeric_ops USING gist ADD - FUNCTION 11 (numeric, numeric) gbt_numeric_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_oid_ops USING gist ADD - FUNCTION 11 (oid, oid) gbt_oid_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_text_ops USING gist ADD - FUNCTION 11 (text, text) gbt_text_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_bpchar_ops USING gist ADD - FUNCTION 11 (bpchar, bpchar) gbt_bpchar_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_time_ops USING gist ADD - FUNCTION 11 (time, time) gbt_time_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timetz_ops USING gist ADD - FUNCTION 11 (timetz, timetz) gbt_time_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timestamp_ops USING gist ADD - FUNCTION 11 (timestamp, timestamp) gbt_ts_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_timestamptz_ops USING gist ADD - FUNCTION 11 (timestamptz, timestamptz) gbt_ts_sortsupport (internal) ; - -ALTER OPERATOR FAMILY gist_uuid_ops USING gist ADD - FUNCTION 11 (uuid, uuid) gbt_uuid_sortsupport (internal) ; diff --git a/contrib/btree_gist/btree_gist.control b/contrib/btree_gist/btree_gist.control index 69d9341a0adea..abf66538f3244 100644 --- a/contrib/btree_gist/btree_gist.control +++ b/contrib/btree_gist/btree_gist.control @@ -1,6 +1,6 @@ # btree_gist extension comment = 'support for indexing common datatypes in GiST' -default_version = '1.9' +default_version = '1.8' module_pathname = '$libdir/btree_gist' relocatable = true trusted = true diff --git a/contrib/btree_gist/meson.build b/contrib/btree_gist/meson.build index 89932dd3844ee..f4fa9574f1fd7 100644 --- a/contrib/btree_gist/meson.build +++ b/contrib/btree_gist/meson.build @@ -51,7 +51,6 @@ install_data( 'btree_gist--1.5--1.6.sql', 'btree_gist--1.6--1.7.sql', 'btree_gist--1.7--1.8.sql', - 'btree_gist--1.8--1.9.sql', kwargs: contrib_data_args, ) diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index 8a0b112a7ff29..f98805fb5f735 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -101,8 +101,8 @@ static void materializeQueryResult(FunctionCallInfo fcinfo, const char *conname, const char *sql, bool fail); -static PGresult *storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql); -static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first); +static PGresult *storeQueryResult(storeInfo *sinfo, PGconn *conn, const char *sql); +static void storeRow(storeInfo *sinfo, PGresult *res, bool first); static remoteConn *getConnectionByName(const char *name); static HTAB *createConnHash(void); static remoteConn *createNewConnection(const char *name); @@ -169,14 +169,6 @@ typedef struct remoteConnHashEnt /* initial number of connection hashes */ #define NUMCONN 16 -static char * -xpstrdup(const char *in) -{ - if (in == NULL) - return NULL; - return pstrdup(in); -} - pg_noreturn static void dblink_res_internalerror(PGconn *conn, PGresult *res, const char *p2) { @@ -240,6 +232,10 @@ dblink_get_conn(char *conname_or_str, errmsg("could not establish connection"), errdetail_internal("%s", msg))); } + + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + dblink_security_check(conn, NULL, connstr); if (PQclientEncoding(conn) != GetDatabaseEncoding()) PQsetClientEncoding(conn, GetDatabaseEncodingName()); @@ -338,6 +334,9 @@ dblink_connect(PG_FUNCTION_ARGS) errdetail_internal("%s", msg))); } + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + /* check password actually used if not superuser */ dblink_security_check(conn, connname, connstr); @@ -863,131 +862,123 @@ static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + bool is_sql_cmd; + int ntuples; + int nfields; /* prepTuplestoreResult must have been called previously */ Assert(rsinfo->returnMode == SFRM_Materialize); - PG_TRY(); + if (PQresultStatus(res) == PGRES_COMMAND_OK) { - TupleDesc tupdesc; - bool is_sql_cmd; - int ntuples; - int nfields; + is_sql_cmd = true; - if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - is_sql_cmd = true; + /* + * need a tuple descriptor representing one TEXT column to return the + * command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + ntuples = 1; + nfields = 1; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - ntuples = 1; - nfields = 1; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + is_sql_cmd = false; - is_sql_cmd = false; + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + ntuples = PQntuples(res); + nfields = PQnfields(res); + } - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - ntuples = PQntuples(res); - nfields = PQnfields(res); - } + /* + * check result and tuple descriptor have the same number of columns + */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); - /* - * check result and tuple descriptor have the same number of columns - */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); + if (ntuples > 0) + { + AttInMetadata *attinmeta; + int nestlevel = -1; + Tuplestorestate *tupstore; + MemoryContext oldcontext; + int row; + char **values; - if (ntuples > 0) - { - AttInMetadata *attinmeta; - int nestlevel = -1; - Tuplestorestate *tupstore; - MemoryContext oldcontext; - int row; - char **values; + attinmeta = TupleDescGetAttInMetadata(tupdesc); - attinmeta = TupleDescGetAttInMetadata(tupdesc); + /* Set GUCs to ensure we read GUC-sensitive data types correctly */ + if (!is_sql_cmd) + nestlevel = applyRemoteGucs(conn); - /* Set GUCs to ensure we read GUC-sensitive data types correctly */ - if (!is_sql_cmd) - nestlevel = applyRemoteGucs(conn); + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); - oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); + values = palloc_array(char *, nfields); - values = palloc_array(char *, nfields); + /* put all tuples into the tuplestore */ + for (row = 0; row < ntuples; row++) + { + HeapTuple tuple; - /* put all tuples into the tuplestore */ - for (row = 0; row < ntuples; row++) + if (!is_sql_cmd) { - HeapTuple tuple; + int i; - if (!is_sql_cmd) + for (i = 0; i < nfields; i++) { - int i; - - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, row, i)) - values[i] = NULL; - else - values[i] = PQgetvalue(res, row, i); - } + if (PQgetisnull(res, row, i)) + values[i] = NULL; + else + values[i] = PQgetvalue(res, row, i); } - else - { - values[0] = PQcmdStatus(res); - } - - /* build the tuple and put it into the tuplestore. */ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); + } + else + { + values[0] = PQcmdStatus(res); } - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); + /* build the tuple and put it into the tuplestore. */ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); } - PG_FINALLY(); - { - /* be sure to release the libpq result */ - PQclear(res); - } - PG_END_TRY(); + + PQclear(res); } /* @@ -1006,16 +997,17 @@ materializeQueryResult(FunctionCallInfo fcinfo, bool fail) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - PGresult *volatile res = NULL; - volatile storeInfo sinfo = {0}; /* prepTuplestoreResult must have been called previously */ Assert(rsinfo->returnMode == SFRM_Materialize); - sinfo.fcinfo = fcinfo; - + /* Use a PG_TRY block to ensure we pump libpq dry of results */ PG_TRY(); { + storeInfo sinfo = {0}; + PGresult *res; + + sinfo.fcinfo = fcinfo; /* Create short-lived memory context for data conversions */ sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "dblink temporary context", @@ -1028,14 +1020,7 @@ materializeQueryResult(FunctionCallInfo fcinfo, (PQresultStatus(res) != PGRES_COMMAND_OK && PQresultStatus(res) != PGRES_TUPLES_OK)) { - /* - * dblink_res_error will clear the passed PGresult, so we need - * this ugly dance to avoid doing so twice during error exit - */ - PGresult *res1 = res; - - res = NULL; - dblink_res_error(conn, conname, res1, fail, + dblink_res_error(conn, conname, res, fail, "while executing query"); /* if fail isn't set, we'll return an empty query result */ } @@ -1074,7 +1059,6 @@ materializeQueryResult(FunctionCallInfo fcinfo, tuplestore_puttuple(tupstore, tuple); PQclear(res); - res = NULL; } else { @@ -1083,26 +1067,20 @@ materializeQueryResult(FunctionCallInfo fcinfo, Assert(rsinfo->setResult != NULL); PQclear(res); - res = NULL; } /* clean up data conversion short-lived memory context */ if (sinfo.tmpcontext != NULL) MemoryContextDelete(sinfo.tmpcontext); - sinfo.tmpcontext = NULL; PQclear(sinfo.last_res); - sinfo.last_res = NULL; PQclear(sinfo.cur_res); - sinfo.cur_res = NULL; } PG_CATCH(); { - /* be sure to release any libpq result we collected */ - PQclear(res); - PQclear(sinfo.last_res); - PQclear(sinfo.cur_res); - /* and clear out any pending data in libpq */ + PGresult *res; + + /* be sure to clear out any pending data in libpq */ while ((res = libpqsrv_get_result(conn, dblink_we_get_result)) != NULL) PQclear(res); @@ -1115,7 +1093,7 @@ materializeQueryResult(FunctionCallInfo fcinfo, * Execute query, and send any result rows to sinfo->tuplestore. */ static PGresult * -storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) +storeQueryResult(storeInfo *sinfo, PGconn *conn, const char *sql) { bool first = true; int nestlevel = -1; @@ -1183,7 +1161,7 @@ storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) * (in this case the PGresult might contain either zero or one row). */ static void -storeRow(volatile storeInfo *sinfo, PGresult *res, bool first) +storeRow(storeInfo *sinfo, PGresult *res, bool first) { int nfields = PQnfields(res); HeapTuple tuple; @@ -2788,10 +2766,13 @@ dblink_connstr_check(const char *connstr) /* * Report an error received from the remote server * - * res: the received error result (will be freed) + * res: the received error result * fail: true for ERROR ereport, false for NOTICE * fmt and following args: sprintf-style format and values for errcontext; * the resulting string should be worded like "while " + * + * If "res" is not NULL, it'll be PQclear'ed here (unless we throw error, + * in which case memory context cleanup will clear it eventually). */ static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, @@ -2799,15 +2780,11 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, { int level; char *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - char *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); - char *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); - char *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); - char *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT); + char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); + char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); + char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); + char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT); int sqlstate; - char *message_primary; - char *message_detail; - char *message_hint; - char *message_context; va_list ap; char dblink_context_msg[512]; @@ -2825,11 +2802,6 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, else sqlstate = ERRCODE_CONNECTION_FAILURE; - message_primary = xpstrdup(pg_diag_message_primary); - message_detail = xpstrdup(pg_diag_message_detail); - message_hint = xpstrdup(pg_diag_message_hint); - message_context = xpstrdup(pg_diag_context); - /* * If we don't get a message from the PGresult, try the PGconn. This is * needed because for connection-level failures, PQgetResult may just @@ -2838,14 +2810,6 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, if (message_primary == NULL) message_primary = pchomp(PQerrorMessage(conn)); - /* - * Now that we've copied all the data we need out of the PGresult, it's - * safe to free it. We must do this to avoid PGresult leakage. We're - * leaking all the strings too, but those are in palloc'd memory that will - * get cleaned up eventually. - */ - PQclear(res); - /* * Format the basic errcontext string. Below, we'll add on something * about the connection name. That's a violation of the translatability @@ -2870,6 +2834,7 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, dblink_context_msg, conname)) : (errcontext("%s on unnamed dblink connection", dblink_context_msg)))); + PQclear(res); } /* diff --git a/contrib/dblink/meson.build b/contrib/dblink/meson.build index dfd8eb6877e90..a19ce6cf4b924 100644 --- a/contrib/dblink/meson.build +++ b/contrib/dblink/meson.build @@ -34,7 +34,7 @@ tests += { 'sql': [ 'dblink', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], }, 'tap': { 'tests': [ diff --git a/contrib/isn/isn.c b/contrib/isn/isn.c index 038c8ed4db7bd..1880c91844e90 100644 --- a/contrib/isn/isn.c +++ b/contrib/isn/isn.c @@ -726,7 +726,7 @@ string2ean(const char *str, struct Node *escontext, ean13 *result, if (type != INVALID) goto eaninvalid; type = ISSN; - *aux1++ = toupper((unsigned char) *aux2); + *aux1++ = pg_ascii_toupper((unsigned char) *aux2); length++; } else if (length == 9 && (digit || *aux2 == 'X' || *aux2 == 'x') && last) @@ -736,7 +736,7 @@ string2ean(const char *str, struct Node *escontext, ean13 *result, goto eaninvalid; if (type == INVALID) type = ISBN; /* ISMN must start with 'M' */ - *aux1++ = toupper((unsigned char) *aux2); + *aux1++ = pg_ascii_toupper((unsigned char) *aux2); length++; } else if (length == 11 && digit && last) diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out index 2b1d54a627949..8502f9efb4190 100644 --- a/contrib/pageinspect/expected/gist.out +++ b/contrib/pageinspect/expected/gist.out @@ -5,21 +5,21 @@ CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM CREATE INDEX test_gist_idx ON test_gist USING gist (p); -- Page 0 is the root, the rest are leaf pages SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); - lsn | nsn | rightlink | flags ------+-----+------------+------- - 0/1 | 0/0 | 4294967295 | {} + lsn | nsn | rightlink | flags +------------+------------+------------+------- + 0/00000001 | 0/00000000 | 4294967295 | {} (1 row) SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); - lsn | nsn | rightlink | flags ------+-----+------------+-------- - 0/1 | 0/0 | 4294967295 | {leaf} + lsn | nsn | rightlink | flags +------------+------------+------------+-------- + 0/00000001 | 0/00000000 | 4294967295 | {leaf} (1 row) SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); - lsn | nsn | rightlink | flags ------+-----+-----------+-------- - 0/1 | 0/0 | 1 | {leaf} + lsn | nsn | rightlink | flags +------------+------------+-----------+-------- + 0/00000001 | 0/00000000 | 1 | {leaf} (1 row) SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index e42fd9747fd1c..fcf19c5ca5a50 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -265,9 +265,9 @@ SELECT fsm_page_contents(decode(repeat('00', :block_size), 'hex')); (1 row) SELECT page_header(decode(repeat('00', :block_size), 'hex')); - page_header ------------------------ - (0/0,0,0,0,0,0,0,0,0) + page_header +------------------------------ + (0/00000000,0,0,0,0,0,0,0,0) (1 row) SELECT page_checksum(decode(repeat('00', :block_size), 'hex'), 1); diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 0d57123aa2669..aef442b5db30a 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -282,7 +282,7 @@ page_header(PG_FUNCTION_ARGS) { char lsnchar[64]; - snprintf(lsnchar, sizeof(lsnchar), "%X/%X", LSN_FORMAT_ARGS(lsn)); + snprintf(lsnchar, sizeof(lsnchar), "%X/%08X", LSN_FORMAT_ARGS(lsn)); values[0] = CStringGetTextDatum(lsnchar); } else diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 4b007f6e1b06a..ae0291e6e96df 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -320,7 +320,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) uint64 os_page_count; int pages_per_buffer; int max_entries; - volatile uint64 touch pg_attribute_unused(); char *startptr, *endptr; @@ -375,7 +374,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Only need to touch memory once per backend process lifetime */ if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, ptr); + pg_numa_touch_mem_if_required(ptr); } Assert(idx == os_page_count); diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile index b2bd8794d2a14..fe0478ac55266 100644 --- a/contrib/pg_stat_statements/Makefile +++ b/contrib/pg_stat_statements/Makefile @@ -7,6 +7,7 @@ OBJS = \ EXTENSION = pg_stat_statements DATA = pg_stat_statements--1.4.sql \ + pg_stat_statements--1.12--1.13.sql \ pg_stat_statements--1.11--1.12.sql pg_stat_statements--1.10--1.11.sql \ pg_stat_statements--1.9--1.10.sql pg_stat_statements--1.8--1.9.sql \ pg_stat_statements--1.7--1.8.sql pg_stat_statements--1.6--1.7.sql \ @@ -20,7 +21,7 @@ LDFLAGS_SL += $(filter -lm, $(LIBS)) REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_statements/pg_stat_statements.conf REGRESS = select dml cursors utility level_tracking planning \ user_activity wal entry_timestamp privileges extended \ - parallel cleanup oldextversions squashing + parallel plancache cleanup oldextversions squashing # Disabled because these tests require "shared_preload_libraries=pg_stat_statements", # which typical installcheck users do not have (e.g. buildfarm clients). NO_INSTALLCHECK = 1 diff --git a/contrib/pg_stat_statements/expected/cursors.out b/contrib/pg_stat_statements/expected/cursors.out index 0fc4b2c098d0e..6afb48ace9220 100644 --- a/contrib/pg_stat_statements/expected/cursors.out +++ b/contrib/pg_stat_statements/expected/cursors.out @@ -57,8 +57,8 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 0 | COMMIT 1 | 0 | DECLARE cursor_stats_1 CURSOR WITH HOLD FOR SELECT $1 1 | 0 | DECLARE cursor_stats_2 CURSOR WITH HOLD FOR SELECT $1 - 1 | 1 | FETCH 1 IN cursor_stats_1 - 1 | 1 | FETCH 1 IN cursor_stats_2 + 1 | 1 | FETCH $1 IN cursor_stats_1 + 1 | 1 | FETCH $1 IN cursor_stats_2 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (9 rows) @@ -68,3 +68,140 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- Normalization of FETCH statements +BEGIN; +DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series(1, 10); +-- implicit directions +FETCH pgss_cursor; +-- +(1 row) + +FETCH 1 pgss_cursor; +-- +(1 row) + +FETCH 2 pgss_cursor; +-- +(2 rows) + +FETCH -1 pgss_cursor; +-- +(1 row) + +-- explicit NEXT +FETCH NEXT pgss_cursor; +-- +(1 row) + +-- explicit PRIOR +FETCH PRIOR pgss_cursor; +-- +(1 row) + +-- explicit FIRST +FETCH FIRST pgss_cursor; +-- +(1 row) + +-- explicit LAST +FETCH LAST pgss_cursor; +-- +(1 row) + +-- explicit ABSOLUTE +FETCH ABSOLUTE 1 pgss_cursor; +-- +(1 row) + +FETCH ABSOLUTE 2 pgss_cursor; +-- +(1 row) + +FETCH ABSOLUTE -1 pgss_cursor; +-- +(1 row) + +-- explicit RELATIVE +FETCH RELATIVE 1 pgss_cursor; +-- +(0 rows) + +FETCH RELATIVE 2 pgss_cursor; +-- +(0 rows) + +FETCH RELATIVE -1 pgss_cursor; +-- +(1 row) + +-- explicit FORWARD +FETCH ALL pgss_cursor; +-- +(0 rows) + +-- explicit FORWARD ALL +FETCH FORWARD ALL pgss_cursor; +-- +(0 rows) + +-- explicit FETCH FORWARD +FETCH FORWARD pgss_cursor; +-- +(0 rows) + +FETCH FORWARD 1 pgss_cursor; +-- +(0 rows) + +FETCH FORWARD 2 pgss_cursor; +-- +(0 rows) + +FETCH FORWARD -1 pgss_cursor; +-- +(1 row) + +-- explicit FETCH BACKWARD +FETCH BACKWARD pgss_cursor; +-- +(1 row) + +FETCH BACKWARD 1 pgss_cursor; +-- +(1 row) + +FETCH BACKWARD 2 pgss_cursor; +-- +(2 rows) + +FETCH BACKWARD -1 pgss_cursor; +-- +(1 row) + +-- explicit BACKWARD ALL +FETCH BACKWARD ALL pgss_cursor; +-- +(6 rows) + +COMMIT; +SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; + calls | query +-------+-------------------------------------------------------------------- + 1 | BEGIN + 1 | COMMIT + 1 | DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series($1, $2) + 3 | FETCH ABSOLUTE $1 pgss_cursor + 1 | FETCH ALL pgss_cursor + 1 | FETCH BACKWARD ALL pgss_cursor + 4 | FETCH BACKWARD pgss_cursor + 1 | FETCH FIRST pgss_cursor + 1 | FETCH FORWARD ALL pgss_cursor + 4 | FETCH FORWARD pgss_cursor + 1 | FETCH LAST pgss_cursor + 1 | FETCH NEXT pgss_cursor + 1 | FETCH PRIOR pgss_cursor + 3 | FETCH RELATIVE $1 pgss_cursor + 4 | FETCH pgss_cursor + 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t +(16 rows) + diff --git a/contrib/pg_stat_statements/expected/level_tracking.out b/contrib/pg_stat_statements/expected/level_tracking.out index 8213fcd2e612c..8e8388dd5cb1f 100644 --- a/contrib/pg_stat_statements/expected/level_tracking.out +++ b/contrib/pg_stat_statements/expected/level_tracking.out @@ -1147,7 +1147,7 @@ SELECT toplevel, calls, query FROM pg_stat_statements t | 1 | COMMIT t | 1 | DECLARE FOOCUR CURSOR FOR SELECT * from stats_track_tab f | 1 | DECLARE FOOCUR CURSOR FOR SELECT * from stats_track_tab; - t | 1 | FETCH FORWARD 1 FROM foocur + t | 1 | FETCH FORWARD $1 FROM foocur t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (7 rows) @@ -1176,7 +1176,7 @@ SELECT toplevel, calls, query FROM pg_stat_statements t | 1 | CLOSE foocur t | 1 | COMMIT t | 1 | DECLARE FOOCUR CURSOR FOR SELECT * FROM stats_track_tab - t | 1 | FETCH FORWARD 1 FROM foocur + t | 1 | FETCH FORWARD $1 FROM foocur t | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (6 rows) diff --git a/contrib/pg_stat_statements/expected/oldextversions.out b/contrib/pg_stat_statements/expected/oldextversions.out index de679b19711ab..726383a99d7c1 100644 --- a/contrib/pg_stat_statements/expected/oldextversions.out +++ b/contrib/pg_stat_statements/expected/oldextversions.out @@ -407,4 +407,71 @@ SELECT count(*) > 0 AS has_data FROM pg_stat_statements; t (1 row) +-- New functions and views for pg_stat_statements in 1.13 +AlTER EXTENSION pg_stat_statements UPDATE TO '1.13'; +\d pg_stat_statements + View "public.pg_stat_statements" + Column | Type | Collation | Nullable | Default +----------------------------+--------------------------+-----------+----------+--------- + userid | oid | | | + dbid | oid | | | + toplevel | boolean | | | + queryid | bigint | | | + query | text | | | + plans | bigint | | | + total_plan_time | double precision | | | + min_plan_time | double precision | | | + max_plan_time | double precision | | | + mean_plan_time | double precision | | | + stddev_plan_time | double precision | | | + calls | bigint | | | + total_exec_time | double precision | | | + min_exec_time | double precision | | | + max_exec_time | double precision | | | + mean_exec_time | double precision | | | + stddev_exec_time | double precision | | | + rows | bigint | | | + shared_blks_hit | bigint | | | + shared_blks_read | bigint | | | + shared_blks_dirtied | bigint | | | + shared_blks_written | bigint | | | + local_blks_hit | bigint | | | + local_blks_read | bigint | | | + local_blks_dirtied | bigint | | | + local_blks_written | bigint | | | + temp_blks_read | bigint | | | + temp_blks_written | bigint | | | + shared_blk_read_time | double precision | | | + shared_blk_write_time | double precision | | | + local_blk_read_time | double precision | | | + local_blk_write_time | double precision | | | + temp_blk_read_time | double precision | | | + temp_blk_write_time | double precision | | | + wal_records | bigint | | | + wal_fpi | bigint | | | + wal_bytes | numeric | | | + wal_buffers_full | bigint | | | + jit_functions | bigint | | | + jit_generation_time | double precision | | | + jit_inlining_count | bigint | | | + jit_inlining_time | double precision | | | + jit_optimization_count | bigint | | | + jit_optimization_time | double precision | | | + jit_emission_count | bigint | | | + jit_emission_time | double precision | | | + jit_deform_count | bigint | | | + jit_deform_time | double precision | | | + parallel_workers_to_launch | bigint | | | + parallel_workers_launched | bigint | | | + generic_plan_calls | bigint | | | + custom_plan_calls | bigint | | | + stats_since | timestamp with time zone | | | + minmax_stats_since | timestamp with time zone | | | + +SELECT count(*) > 0 AS has_data FROM pg_stat_statements; + has_data +---------- + t +(1 row) + DROP EXTENSION pg_stat_statements; diff --git a/contrib/pg_stat_statements/expected/plancache.out b/contrib/pg_stat_statements/expected/plancache.out new file mode 100644 index 0000000000000..e152de9f55130 --- /dev/null +++ b/contrib/pg_stat_statements/expected/plancache.out @@ -0,0 +1,224 @@ +-- +-- Tests with plan cache +-- +-- Setup +CREATE OR REPLACE FUNCTION select_one_func(int) RETURNS VOID AS $$ +DECLARE + ret INT; +BEGIN + SELECT $1 INTO ret; +END; +$$ LANGUAGE plpgsql; +CREATE OR REPLACE PROCEDURE select_one_proc(int) AS $$ +DECLARE + ret INT; +BEGIN + SELECT $1 INTO ret; +END; +$$ LANGUAGE plpgsql; +-- Prepared statements +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +PREPARE p1 AS SELECT $1 AS a; +SET plan_cache_mode TO force_generic_plan; +EXECUTE p1(1); + a +--- + 1 +(1 row) + +SET plan_cache_mode TO force_custom_plan; +EXECUTE p1(1); + a +--- + 1 +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | query +-------+--------------------+-------------------+---------------------------------------------------- + 2 | 1 | 1 | PREPARE p1 AS SELECT $1 AS a + 1 | 0 | 0 | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | SET plan_cache_mode TO $1 +(3 rows) + +DEALLOCATE p1; +-- Extended query protocol +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT $1 AS a \parse p1 +SET plan_cache_mode TO force_generic_plan; +\bind_named p1 1 +; + a +--- + 1 +(1 row) + +SET plan_cache_mode TO force_custom_plan; +\bind_named p1 1 +; + a +--- + 1 +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | query +-------+--------------------+-------------------+---------------------------------------------------- + 2 | 1 | 1 | SELECT $1 AS a + 1 | 0 | 0 | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | SET plan_cache_mode TO $1 +(3 rows) + +\close_prepared p1 +-- EXPLAIN [ANALYZE] EXECUTE +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +PREPARE p1 AS SELECT $1; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); + QUERY PLAN +------------ + Result +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); + QUERY PLAN +------------ + Result +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+---------------------------------------------------------------------------------- + 2 | 0 | 0 | t | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1) + 2 | 0 | 0 | t | EXPLAIN (COSTS OFF) EXECUTE p1(1) + 4 | 2 | 2 | f | PREPARE p1 AS SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(5 rows) + +RESET pg_stat_statements.track; +DEALLOCATE p1; +-- Functions/procedures +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SET plan_cache_mode TO force_generic_plan; +SELECT select_one_func(1); + select_one_func +----------------- + +(1 row) + +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +SELECT select_one_func(1); + select_one_func +----------------- + +(1 row) + +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+---------------------------------------------------- + 2 | 0 | 0 | t | CALL select_one_proc($1) + 4 | 2 | 2 | f | SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SELECT select_one_func($1) + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(5 rows) + +-- +-- EXPLAIN [ANALYZE] EXECUTE + functions/procedures +-- +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +EXPLAIN (COSTS OFF) SELECT select_one_func(1); + QUERY PLAN +------------ + Result +(1 row) + +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); + QUERY PLAN +----------------------------------- + Result (actual rows=1.00 loops=1) +(1 row) + +EXPLAIN (COSTS OFF) SELECT select_one_func(1); + QUERY PLAN +------------ + Result +(1 row) + +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C", toplevel; + calls | generic_plan_calls | custom_plan_calls | toplevel | query +-------+--------------------+-------------------+----------+------------------------------------------------------------------------------------------------ + 2 | 0 | 0 | t | CALL select_one_proc($1) + 2 | 0 | 0 | t | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func($1) + 4 | 0 | 0 | f | EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func($1); + 2 | 0 | 0 | t | EXPLAIN (COSTS OFF) SELECT select_one_func($1) + 4 | 2 | 2 | f | SELECT $1 + 1 | 0 | 0 | t | SELECT pg_stat_statements_reset() IS NOT NULL AS t + 2 | 0 | 0 | t | SET plan_cache_mode TO $1 +(7 rows) + +RESET pg_stat_statements.track; +-- +-- Cleanup +-- +DROP FUNCTION select_one_func(int); +DROP PROCEDURE select_one_proc(int); diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index 060d4416dd749..e4d6564ea5b5a 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -702,7 +702,7 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 13 | CREATE MATERIALIZED VIEW pgss_matv AS SELECT * FROM pgss_ctas 1 | 10 | CREATE TABLE pgss_ctas AS SELECT a, $1 b FROM generate_series($2, $3) a 1 | 0 | DECLARE pgss_cursor CURSOR FOR SELECT * FROM pgss_matv - 1 | 5 | FETCH FORWARD 5 pgss_cursor + 1 | 5 | FETCH FORWARD $1 pgss_cursor 1 | 7 | FETCH FORWARD ALL pgss_cursor 1 | 1 | FETCH NEXT pgss_cursor 1 | 13 | REFRESH MATERIALIZED VIEW pgss_matv diff --git a/contrib/pg_stat_statements/meson.build b/contrib/pg_stat_statements/meson.build index 01a6cbdcf6139..7b8bfbb1de78c 100644 --- a/contrib/pg_stat_statements/meson.build +++ b/contrib/pg_stat_statements/meson.build @@ -21,6 +21,7 @@ contrib_targets += pg_stat_statements install_data( 'pg_stat_statements.control', 'pg_stat_statements--1.4.sql', + 'pg_stat_statements--1.12--1.13.sql', 'pg_stat_statements--1.11--1.12.sql', 'pg_stat_statements--1.10--1.11.sql', 'pg_stat_statements--1.9--1.10.sql', @@ -54,6 +55,7 @@ tests += { 'privileges', 'extended', 'parallel', + 'plancache', 'cleanup', 'oldextversions', 'squashing', diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql new file mode 100644 index 0000000000000..2f0eaf14ec34d --- /dev/null +++ b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql @@ -0,0 +1,78 @@ +/* contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_stat_statements UPDATE TO '1.13'" to load this file. \quit + +/* First we have to remove them from the extension */ +ALTER EXTENSION pg_stat_statements DROP VIEW pg_stat_statements; +ALTER EXTENSION pg_stat_statements DROP FUNCTION pg_stat_statements(boolean); + +/* Then we can drop them */ +DROP VIEW pg_stat_statements; +DROP FUNCTION pg_stat_statements(boolean); + +/* Now redefine */ +CREATE FUNCTION pg_stat_statements(IN showtext boolean, + OUT userid oid, + OUT dbid oid, + OUT toplevel bool, + OUT queryid bigint, + OUT query text, + OUT plans int8, + OUT total_plan_time float8, + OUT min_plan_time float8, + OUT max_plan_time float8, + OUT mean_plan_time float8, + OUT stddev_plan_time float8, + OUT calls int8, + OUT total_exec_time float8, + OUT min_exec_time float8, + OUT max_exec_time float8, + OUT mean_exec_time float8, + OUT stddev_exec_time float8, + OUT rows int8, + OUT shared_blks_hit int8, + OUT shared_blks_read int8, + OUT shared_blks_dirtied int8, + OUT shared_blks_written int8, + OUT local_blks_hit int8, + OUT local_blks_read int8, + OUT local_blks_dirtied int8, + OUT local_blks_written int8, + OUT temp_blks_read int8, + OUT temp_blks_written int8, + OUT shared_blk_read_time float8, + OUT shared_blk_write_time float8, + OUT local_blk_read_time float8, + OUT local_blk_write_time float8, + OUT temp_blk_read_time float8, + OUT temp_blk_write_time float8, + OUT wal_records int8, + OUT wal_fpi int8, + OUT wal_bytes numeric, + OUT wal_buffers_full int8, + OUT jit_functions int8, + OUT jit_generation_time float8, + OUT jit_inlining_count int8, + OUT jit_inlining_time float8, + OUT jit_optimization_count int8, + OUT jit_optimization_time float8, + OUT jit_emission_count int8, + OUT jit_emission_time float8, + OUT jit_deform_count int8, + OUT jit_deform_time float8, + OUT parallel_workers_to_launch int8, + OUT parallel_workers_launched int8, + OUT generic_plan_calls int8, + OUT custom_plan_calls int8, + OUT stats_since timestamp with time zone, + OUT minmax_stats_since timestamp with time zone +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_stat_statements_1_13' +LANGUAGE C STRICT VOLATILE PARALLEL SAFE; + +CREATE VIEW pg_stat_statements AS + SELECT * FROM pg_stat_statements(true); + +GRANT SELECT ON pg_stat_statements TO PUBLIC; diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index e7857f81ec057..9fc9635d3300d 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -85,7 +85,7 @@ PG_MODULE_MAGIC_EXT( #define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat" /* Magic number identifying the stats file format */ -static const uint32 PGSS_FILE_HEADER = 0x20220408; +static const uint32 PGSS_FILE_HEADER = 0x20250731; /* PostgreSQL major version number, changes in which invalidate all entries */ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; @@ -114,6 +114,7 @@ typedef enum pgssVersion PGSS_V1_10, PGSS_V1_11, PGSS_V1_12, + PGSS_V1_13, } pgssVersion; typedef enum pgssStoreKind @@ -210,6 +211,8 @@ typedef struct Counters * to be launched */ int64 parallel_workers_launched; /* # of parallel workers actually * launched */ + int64 generic_plan_calls; /* number of calls using a generic plan */ + int64 custom_plan_calls; /* number of calls using a custom plan */ } Counters; /* @@ -323,6 +326,7 @@ PG_FUNCTION_INFO_V1(pg_stat_statements_1_9); PG_FUNCTION_INFO_V1(pg_stat_statements_1_10); PG_FUNCTION_INFO_V1(pg_stat_statements_1_11); PG_FUNCTION_INFO_V1(pg_stat_statements_1_12); +PG_FUNCTION_INFO_V1(pg_stat_statements_1_13); PG_FUNCTION_INFO_V1(pg_stat_statements); PG_FUNCTION_INFO_V1(pg_stat_statements_info); @@ -355,7 +359,8 @@ static void pgss_store(const char *query, int64 queryId, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched); + int parallel_workers_launched, + PlannedStmtOrigin planOrigin); static void pg_stat_statements_internal(FunctionCallInfo fcinfo, pgssVersion api_version, bool showtext); @@ -877,7 +882,8 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) NULL, jstate, 0, - 0); + 0, + PLAN_STMT_UNKNOWN); } /* @@ -957,7 +963,8 @@ pgss_planner(Query *parse, NULL, NULL, 0, - 0); + 0, + result->planOrigin); } else { @@ -1091,7 +1098,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, - queryDesc->estate->es_parallel_workers_launched); + queryDesc->estate->es_parallel_workers_launched, + queryDesc->plannedstmt->planOrigin); } if (prev_ExecutorEnd) @@ -1224,7 +1232,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, NULL, NULL, 0, - 0); + 0, + pstmt->planOrigin); } else { @@ -1287,7 +1296,8 @@ pgss_store(const char *query, int64 queryId, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched) + int parallel_workers_launched, + PlannedStmtOrigin planOrigin) { pgssHashKey key; pgssEntry *entry; @@ -1495,6 +1505,12 @@ pgss_store(const char *query, int64 queryId, entry->counters.parallel_workers_to_launch += parallel_workers_to_launch; entry->counters.parallel_workers_launched += parallel_workers_launched; + /* plan cache counters */ + if (planOrigin == PLAN_STMT_CACHE_GENERIC) + entry->counters.generic_plan_calls++; + else if (planOrigin == PLAN_STMT_CACHE_CUSTOM) + entry->counters.custom_plan_calls++; + SpinLockRelease(&entry->mutex); } @@ -1562,7 +1578,8 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) #define PG_STAT_STATEMENTS_COLS_V1_10 43 #define PG_STAT_STATEMENTS_COLS_V1_11 49 #define PG_STAT_STATEMENTS_COLS_V1_12 52 -#define PG_STAT_STATEMENTS_COLS 52 /* maximum of above */ +#define PG_STAT_STATEMENTS_COLS_V1_13 54 +#define PG_STAT_STATEMENTS_COLS 54 /* maximum of above */ /* * Retrieve statement statistics. @@ -1574,6 +1591,16 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) * expected API version is identified by embedding it in the C name of the * function. Unfortunately we weren't bright enough to do that for 1.1. */ +Datum +pg_stat_statements_1_13(PG_FUNCTION_ARGS) +{ + bool showtext = PG_GETARG_BOOL(0); + + pg_stat_statements_internal(fcinfo, PGSS_V1_13, showtext); + + return (Datum) 0; +} + Datum pg_stat_statements_1_12(PG_FUNCTION_ARGS) { @@ -1732,6 +1759,10 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, if (api_version != PGSS_V1_12) elog(ERROR, "incorrect number of output arguments"); break; + case PG_STAT_STATEMENTS_COLS_V1_13: + if (api_version != PGSS_V1_13) + elog(ERROR, "incorrect number of output arguments"); + break; default: elog(ERROR, "incorrect number of output arguments"); } @@ -1984,6 +2015,11 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, values[i++] = Int64GetDatumFast(tmp.parallel_workers_to_launch); values[i++] = Int64GetDatumFast(tmp.parallel_workers_launched); } + if (api_version >= PGSS_V1_13) + { + values[i++] = Int64GetDatumFast(tmp.generic_plan_calls); + values[i++] = Int64GetDatumFast(tmp.custom_plan_calls); + } if (api_version >= PGSS_V1_11) { values[i++] = TimestampTzGetDatum(stats_since); @@ -1999,6 +2035,7 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, api_version == PGSS_V1_10 ? PG_STAT_STATEMENTS_COLS_V1_10 : api_version == PGSS_V1_11 ? PG_STAT_STATEMENTS_COLS_V1_11 : api_version == PGSS_V1_12 ? PG_STAT_STATEMENTS_COLS_V1_12 : + api_version == PGSS_V1_13 ? PG_STAT_STATEMENTS_COLS_V1_13 : -1 /* fail if you forget to update this assert */ )); tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); diff --git a/contrib/pg_stat_statements/pg_stat_statements.control b/contrib/pg_stat_statements/pg_stat_statements.control index d45ebc12e3605..2eee0ceffa894 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.control +++ b/contrib/pg_stat_statements/pg_stat_statements.control @@ -1,5 +1,5 @@ # pg_stat_statements extension comment = 'track planning and execution statistics of all SQL statements executed' -default_version = '1.12' +default_version = '1.13' module_pathname = '$libdir/pg_stat_statements' relocatable = true diff --git a/contrib/pg_stat_statements/sql/cursors.sql b/contrib/pg_stat_statements/sql/cursors.sql index 61738ac470e82..78bb42284331f 100644 --- a/contrib/pg_stat_statements/sql/cursors.sql +++ b/contrib/pg_stat_statements/sql/cursors.sql @@ -28,3 +28,46 @@ COMMIT; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- Normalization of FETCH statements +BEGIN; +DECLARE pgss_cursor CURSOR FOR SELECT FROM generate_series(1, 10); +-- implicit directions +FETCH pgss_cursor; +FETCH 1 pgss_cursor; +FETCH 2 pgss_cursor; +FETCH -1 pgss_cursor; +-- explicit NEXT +FETCH NEXT pgss_cursor; +-- explicit PRIOR +FETCH PRIOR pgss_cursor; +-- explicit FIRST +FETCH FIRST pgss_cursor; +-- explicit LAST +FETCH LAST pgss_cursor; +-- explicit ABSOLUTE +FETCH ABSOLUTE 1 pgss_cursor; +FETCH ABSOLUTE 2 pgss_cursor; +FETCH ABSOLUTE -1 pgss_cursor; +-- explicit RELATIVE +FETCH RELATIVE 1 pgss_cursor; +FETCH RELATIVE 2 pgss_cursor; +FETCH RELATIVE -1 pgss_cursor; +-- explicit FORWARD +FETCH ALL pgss_cursor; +-- explicit FORWARD ALL +FETCH FORWARD ALL pgss_cursor; +-- explicit FETCH FORWARD +FETCH FORWARD pgss_cursor; +FETCH FORWARD 1 pgss_cursor; +FETCH FORWARD 2 pgss_cursor; +FETCH FORWARD -1 pgss_cursor; +-- explicit FETCH BACKWARD +FETCH BACKWARD pgss_cursor; +FETCH BACKWARD 1 pgss_cursor; +FETCH BACKWARD 2 pgss_cursor; +FETCH BACKWARD -1 pgss_cursor; +-- explicit BACKWARD ALL +FETCH BACKWARD ALL pgss_cursor; +COMMIT; +SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"; diff --git a/contrib/pg_stat_statements/sql/oldextversions.sql b/contrib/pg_stat_statements/sql/oldextversions.sql index 13b8ca28586d1..e416efe9ffbee 100644 --- a/contrib/pg_stat_statements/sql/oldextversions.sql +++ b/contrib/pg_stat_statements/sql/oldextversions.sql @@ -63,4 +63,9 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12'; \d pg_stat_statements SELECT count(*) > 0 AS has_data FROM pg_stat_statements; +-- New functions and views for pg_stat_statements in 1.13 +AlTER EXTENSION pg_stat_statements UPDATE TO '1.13'; +\d pg_stat_statements +SELECT count(*) > 0 AS has_data FROM pg_stat_statements; + DROP EXTENSION pg_stat_statements; diff --git a/contrib/pg_stat_statements/sql/plancache.sql b/contrib/pg_stat_statements/sql/plancache.sql new file mode 100644 index 0000000000000..160ced7add368 --- /dev/null +++ b/contrib/pg_stat_statements/sql/plancache.sql @@ -0,0 +1,94 @@ +-- +-- Tests with plan cache +-- + +-- Setup +CREATE OR REPLACE FUNCTION select_one_func(int) RETURNS VOID AS $$ +DECLARE + ret INT; +BEGIN + SELECT $1 INTO ret; +END; +$$ LANGUAGE plpgsql; +CREATE OR REPLACE PROCEDURE select_one_proc(int) AS $$ +DECLARE + ret INT; +BEGIN + SELECT $1 INTO ret; +END; +$$ LANGUAGE plpgsql; + +-- Prepared statements +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +PREPARE p1 AS SELECT $1 AS a; +SET plan_cache_mode TO force_generic_plan; +EXECUTE p1(1); +SET plan_cache_mode TO force_custom_plan; +EXECUTE p1(1); +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +DEALLOCATE p1; + +-- Extended query protocol +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT $1 AS a \parse p1 +SET plan_cache_mode TO force_generic_plan; +\bind_named p1 1 +; +SET plan_cache_mode TO force_custom_plan; +\bind_named p1 1 +; +SELECT calls, generic_plan_calls, custom_plan_calls, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +\close_prepared p1 + +-- EXPLAIN [ANALYZE] EXECUTE +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +PREPARE p1 AS SELECT $1; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (COSTS OFF) EXECUTE p1(1); +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) EXECUTE p1(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; +RESET pg_stat_statements.track; +DEALLOCATE p1; + +-- Functions/procedures +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SET plan_cache_mode TO force_generic_plan; +SELECT select_one_func(1); +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +SELECT select_one_func(1); +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C"; + +-- +-- EXPLAIN [ANALYZE] EXECUTE + functions/procedures +-- +SET pg_stat_statements.track = 'all'; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SET plan_cache_mode TO force_generic_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); +EXPLAIN (COSTS OFF) SELECT select_one_func(1); +CALL select_one_proc(1); +SET plan_cache_mode TO force_custom_plan; +EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF) SELECT select_one_func(1); +EXPLAIN (COSTS OFF) SELECT select_one_func(1); +CALL select_one_proc(1); +SELECT calls, generic_plan_calls, custom_plan_calls, toplevel, query FROM pg_stat_statements + ORDER BY query COLLATE "C", toplevel; + +RESET pg_stat_statements.track; + +-- +-- Cleanup +-- +DROP FUNCTION select_one_func(int); +DROP PROCEDURE select_one_proc(int); diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out index c010eed8c5d6e..f955ff5d3c52a 100644 --- a/contrib/pg_walinspect/expected/pg_walinspect.out +++ b/contrib/pg_walinspect/expected/pg_walinspect.out @@ -19,14 +19,14 @@ INSERT INTO sample_tbl SELECT * FROM generate_series(3, 4); -- =================================================================== -- Invalid input LSN. SELECT * FROM pg_get_wal_record_info('0/0'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 -- Invalid start LSN. SELECT * FROM pg_get_wal_records_info('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 SELECT * FROM pg_get_wal_stats('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 SELECT * FROM pg_get_wal_block_info('0/0', :'wal_lsn1'); -ERROR: could not read WAL at LSN 0/0 +ERROR: could not read WAL at LSN 0/00000000 -- Start LSN > End LSN. SELECT * FROM pg_get_wal_records_info(:'wal_lsn2', :'wal_lsn1'); ERROR: WAL start LSN must be less than end LSN diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 64745564cc249..0398ad82cec90 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -105,7 +105,7 @@ InitXLogReaderState(XLogRecPtr lsn) if (lsn < XLOG_BLCKSZ) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not read WAL at LSN %X/%X", + errmsg("could not read WAL at LSN %X/%08X", LSN_FORMAT_ARGS(lsn)))); private_data = (ReadLocalXLogPageNoWaitPrivate *) @@ -128,8 +128,8 @@ InitXLogReaderState(XLogRecPtr lsn) if (XLogRecPtrIsInvalid(first_valid_record)) ereport(ERROR, - (errmsg("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(lsn)))); + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(lsn))); return xlogreader; } @@ -168,12 +168,12 @@ ReadNextXLogRecord(XLogReaderState *xlogreader) if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL at %X/%X: %s", + errmsg("could not read WAL at %X/%08X: %s", LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL at %X/%X", + errmsg("could not read WAL at %X/%08X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); } @@ -479,7 +479,7 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL input LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(curr_lsn)))); /* Build a tuple descriptor for our result type. */ @@ -491,7 +491,7 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS) if (!ReadNextXLogRecord(xlogreader)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not read WAL at %X/%X", + errmsg("could not read WAL at %X/%08X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); GetWALRecordInfo(xlogreader, values, nulls, PG_GET_WAL_RECORD_INFO_COLS); @@ -521,7 +521,7 @@ ValidateInputLSNs(XLogRecPtr start_lsn, XLogRecPtr *end_lsn) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(curr_lsn)))); if (start_lsn > *end_lsn) @@ -827,7 +827,7 @@ pg_get_wal_records_info_till_end_of_wal(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(end_lsn)))); GetWALRecordsInfo(fcinfo, start_lsn, end_lsn); @@ -846,7 +846,7 @@ pg_get_wal_stats_till_end_of_wal(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("WAL start LSN must be less than current LSN"), - errdetail("Current WAL LSN on the database system is at %X/%X.", + errdetail("Current WAL LSN on the database system is at %X/%08X.", LSN_FORMAT_ARGS(end_lsn)))); GetWalStats(fcinfo, start_lsn, end_lsn, stats_per_record); diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 304f3c20f8356..e8148f2c5a223 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -142,6 +142,8 @@ static void do_sql_command_begin(PGconn *conn, const char *sql); static void do_sql_command_end(PGconn *conn, const char *sql, bool consume_input); static void begin_remote_xact(ConnCacheEntry *entry); +static void pgfdw_report_internal(int elevel, PGresult *res, PGconn *conn, + const char *sql); static void pgfdw_xact_callback(XactEvent event, void *arg); static void pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid, @@ -625,6 +627,9 @@ connect_pg_server(ForeignServer *server, UserMapping *user) server->servername), errdetail_internal("%s", pchomp(PQerrorMessage(conn))))); + PQsetNoticeReceiver(conn, libpqsrv_notice_receiver, + "received message via remote connection"); + /* Perform post-connection security checks. */ pgfdw_security_check(keywords, values, user, conn); @@ -812,7 +817,7 @@ static void do_sql_command_begin(PGconn *conn, const char *sql) { if (!PQsendQuery(conn, sql)) - pgfdw_report_error(ERROR, NULL, conn, false, sql); + pgfdw_report_error(NULL, conn, sql); } static void @@ -827,10 +832,10 @@ do_sql_command_end(PGconn *conn, const char *sql, bool consume_input) * would be large compared to the overhead of PQconsumeInput.) */ if (consume_input && !PQconsumeInput(conn)) - pgfdw_report_error(ERROR, NULL, conn, false, sql); + pgfdw_report_error(NULL, conn, sql); res = pgfdw_get_result(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, sql); + pgfdw_report_error(res, conn, sql); PQclear(res); } @@ -963,63 +968,73 @@ pgfdw_get_result(PGconn *conn) /* * Report an error we got from the remote server. * - * elevel: error level to use (typically ERROR, but might be less) - * res: PGresult containing the error + * Callers should use pgfdw_report_error() to throw an error, or use + * pgfdw_report() for lesser message levels. (We make this distinction + * so that pgfdw_report_error() can be marked noreturn.) + * + * res: PGresult containing the error (might be NULL) * conn: connection we did the query on - * clear: if true, PQclear the result (otherwise caller will handle it) * sql: NULL, or text of remote command we tried to execute * + * If "res" is not NULL, it'll be PQclear'ed here (unless we throw error, + * in which case memory context cleanup will clear it eventually). + * * Note: callers that choose not to throw ERROR for a remote error are * responsible for making sure that the associated ConnCacheEntry gets * marked with have_error = true. */ void -pgfdw_report_error(int elevel, PGresult *res, PGconn *conn, - bool clear, const char *sql) +pgfdw_report_error(PGresult *res, PGconn *conn, const char *sql) { - /* If requested, PGresult must be released before leaving this function. */ - PG_TRY(); - { - char *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); - char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); - char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); - char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT); - int sqlstate; - - if (diag_sqlstate) - sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], - diag_sqlstate[1], - diag_sqlstate[2], - diag_sqlstate[3], - diag_sqlstate[4]); - else - sqlstate = ERRCODE_CONNECTION_FAILURE; + pgfdw_report_internal(ERROR, res, conn, sql); + pg_unreachable(); +} - /* - * If we don't get a message from the PGresult, try the PGconn. This - * is needed because for connection-level failures, PQgetResult may - * just return NULL, not a PGresult at all. - */ - if (message_primary == NULL) - message_primary = pchomp(PQerrorMessage(conn)); - - ereport(elevel, - (errcode(sqlstate), - (message_primary != NULL && message_primary[0] != '\0') ? - errmsg_internal("%s", message_primary) : - errmsg("could not obtain message string for remote error"), - message_detail ? errdetail_internal("%s", message_detail) : 0, - message_hint ? errhint("%s", message_hint) : 0, - message_context ? errcontext("%s", message_context) : 0, - sql ? errcontext("remote SQL command: %s", sql) : 0)); - } - PG_FINALLY(); - { - if (clear) - PQclear(res); - } - PG_END_TRY(); +void +pgfdw_report(int elevel, PGresult *res, PGconn *conn, const char *sql) +{ + Assert(elevel < ERROR); /* use pgfdw_report_error for that */ + pgfdw_report_internal(elevel, res, conn, sql); +} + +static void +pgfdw_report_internal(int elevel, PGresult *res, PGconn *conn, + const char *sql) +{ + char *diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); + char *message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); + char *message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); + char *message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); + char *message_context = PQresultErrorField(res, PG_DIAG_CONTEXT); + int sqlstate; + + if (diag_sqlstate) + sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], + diag_sqlstate[1], + diag_sqlstate[2], + diag_sqlstate[3], + diag_sqlstate[4]); + else + sqlstate = ERRCODE_CONNECTION_FAILURE; + + /* + * If we don't get a message from the PGresult, try the PGconn. This is + * needed because for connection-level failures, PQgetResult may just + * return NULL, not a PGresult at all. + */ + if (message_primary == NULL) + message_primary = pchomp(PQerrorMessage(conn)); + + ereport(elevel, + (errcode(sqlstate), + (message_primary != NULL && message_primary[0] != '\0') ? + errmsg_internal("%s", message_primary) : + errmsg("could not obtain message string for remote error"), + message_detail ? errdetail_internal("%s", message_detail) : 0, + message_hint ? errhint("%s", message_hint) : 0, + message_context ? errcontext("%s", message_context) : 0, + sql ? errcontext("remote SQL command: %s", sql) : 0)); + PQclear(res); } /* @@ -1542,7 +1557,7 @@ pgfdw_exec_cleanup_query_begin(PGconn *conn, const char *query) */ if (!PQsendQuery(conn, query)) { - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1567,7 +1582,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, */ if (consume_input && !PQconsumeInput(conn)) { - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1579,7 +1594,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, (errmsg("could not get query result due to timeout"), errcontext("remote SQL command: %s", query))); else - pgfdw_report_error(WARNING, NULL, conn, false, query); + pgfdw_report(WARNING, NULL, conn, query); return false; } @@ -1587,7 +1602,7 @@ pgfdw_exec_cleanup_query_end(PGconn *conn, const char *query, /* Issue a warning if not successful. */ if (PQresultStatus(result) != PGRES_COMMAND_OK) { - pgfdw_report_error(WARNING, result, conn, true, query); + pgfdw_report(WARNING, result, conn, query); return ignore_errors; } PQclear(result); @@ -1615,103 +1630,90 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result, bool *timed_out) { - volatile bool failed = false; - PGresult *volatile last_res = NULL; + bool failed = false; + PGresult *last_res = NULL; + int canceldelta = RETRY_CANCEL_TIMEOUT * 2; *result = NULL; *timed_out = false; - - /* In what follows, do not leak any PGresults on an error. */ - PG_TRY(); + for (;;) { - int canceldelta = RETRY_CANCEL_TIMEOUT * 2; + PGresult *res; - for (;;) + while (PQisBusy(conn)) { - PGresult *res; + int wc; + TimestampTz now = GetCurrentTimestamp(); + long cur_timeout; - while (PQisBusy(conn)) + /* If timeout has expired, give up. */ + if (now >= endtime) { - int wc; - TimestampTz now = GetCurrentTimestamp(); - long cur_timeout; - - /* If timeout has expired, give up. */ - if (now >= endtime) - { - *timed_out = true; - failed = true; - goto exit; - } + *timed_out = true; + failed = true; + goto exit; + } - /* If we need to re-issue the cancel request, do that. */ - if (now >= retrycanceltime) - { - /* We ignore failure to issue the repeated request. */ - (void) libpqsrv_cancel(conn, endtime); + /* If we need to re-issue the cancel request, do that. */ + if (now >= retrycanceltime) + { + /* We ignore failure to issue the repeated request. */ + (void) libpqsrv_cancel(conn, endtime); - /* Recompute "now" in case that took measurable time. */ - now = GetCurrentTimestamp(); + /* Recompute "now" in case that took measurable time. */ + now = GetCurrentTimestamp(); - /* Adjust re-cancel timeout in increasing steps. */ - retrycanceltime = TimestampTzPlusMilliseconds(now, - canceldelta); - canceldelta += canceldelta; - } + /* Adjust re-cancel timeout in increasing steps. */ + retrycanceltime = TimestampTzPlusMilliseconds(now, + canceldelta); + canceldelta += canceldelta; + } - /* If timeout has expired, give up, else get sleep time. */ - cur_timeout = TimestampDifferenceMilliseconds(now, - Min(endtime, - retrycanceltime)); - if (cur_timeout <= 0) - { - *timed_out = true; - failed = true; - goto exit; - } + /* If timeout has expired, give up, else get sleep time. */ + cur_timeout = TimestampDifferenceMilliseconds(now, + Min(endtime, + retrycanceltime)); + if (cur_timeout <= 0) + { + *timed_out = true; + failed = true; + goto exit; + } - /* first time, allocate or get the custom wait event */ - if (pgfdw_we_cleanup_result == 0) - pgfdw_we_cleanup_result = WaitEventExtensionNew("PostgresFdwCleanupResult"); + /* first time, allocate or get the custom wait event */ + if (pgfdw_we_cleanup_result == 0) + pgfdw_we_cleanup_result = WaitEventExtensionNew("PostgresFdwCleanupResult"); - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - PQsocket(conn), - cur_timeout, pgfdw_we_cleanup_result); - ResetLatch(MyLatch); + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + cur_timeout, pgfdw_we_cleanup_result); + ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) { - if (!PQconsumeInput(conn)) - { - /* connection trouble */ - failed = true; - goto exit; - } + /* connection trouble */ + failed = true; + goto exit; } } + } - res = PQgetResult(conn); - if (res == NULL) - break; /* query is complete */ + res = PQgetResult(conn); + if (res == NULL) + break; /* query is complete */ - PQclear(last_res); - last_res = res; - } -exit: ; - } - PG_CATCH(); - { PQclear(last_res); - PG_RE_THROW(); + last_res = res; } - PG_END_TRY(); - +exit: if (failed) PQclear(last_res); else diff --git a/contrib/postgres_fdw/deparse.c b/contrib/postgres_fdw/deparse.c index d9970dd675336..e5b5e1a5f51a5 100644 --- a/contrib/postgres_fdw/deparse.c +++ b/contrib/postgres_fdw/deparse.c @@ -39,6 +39,7 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_authid.h" #include "catalog/pg_collation.h" +#include "catalog/pg_database.h" #include "catalog/pg_namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" @@ -160,6 +161,7 @@ static void deparseDistinctExpr(DistinctExpr *node, deparse_expr_cxt *context); static void deparseScalarArrayOpExpr(ScalarArrayOpExpr *node, deparse_expr_cxt *context); static void deparseRelabelType(RelabelType *node, deparse_expr_cxt *context); +static void deparseArrayCoerceExpr(ArrayCoerceExpr *node, deparse_expr_cxt *context); static void deparseBoolExpr(BoolExpr *node, deparse_expr_cxt *context); static void deparseNullTest(NullTest *node, deparse_expr_cxt *context); static void deparseCaseExpr(CaseExpr *node, deparse_expr_cxt *context); @@ -455,6 +457,11 @@ foreign_expr_walker(Node *node, AuthIdRelationId, fpinfo)) return false; break; + case REGDATABASEOID: + if (!is_shippable(DatumGetObjectId(c->constvalue), + DatabaseRelationId, fpinfo)) + return false; + break; } } @@ -696,6 +703,34 @@ foreign_expr_walker(Node *node, state = FDW_COLLATE_UNSAFE; } break; + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *e = (ArrayCoerceExpr *) node; + + /* + * Recurse to input subexpression. + */ + if (!foreign_expr_walker((Node *) e->arg, + glob_cxt, &inner_cxt, case_arg_cxt)) + return false; + + /* + * T_ArrayCoerceExpr must not introduce a collation not + * derived from an input foreign Var (same logic as for a + * function). + */ + collation = e->resultcollid; + if (collation == InvalidOid) + state = FDW_COLLATE_NONE; + else if (inner_cxt.state == FDW_COLLATE_SAFE && + collation == inner_cxt.collation) + state = FDW_COLLATE_SAFE; + else if (collation == DEFAULT_COLLATION_OID) + state = FDW_COLLATE_NONE; + else + state = FDW_COLLATE_UNSAFE; + } + break; case T_BoolExpr: { BoolExpr *b = (BoolExpr *) node; @@ -2913,6 +2948,9 @@ deparseExpr(Expr *node, deparse_expr_cxt *context) case T_RelabelType: deparseRelabelType((RelabelType *) node, context); break; + case T_ArrayCoerceExpr: + deparseArrayCoerceExpr((ArrayCoerceExpr *) node, context); + break; case T_BoolExpr: deparseBoolExpr((BoolExpr *) node, context); break; @@ -3501,6 +3539,24 @@ deparseRelabelType(RelabelType *node, deparse_expr_cxt *context) node->resulttypmod)); } +/* + * Deparse an ArrayCoerceExpr (array-type conversion) node. + */ +static void +deparseArrayCoerceExpr(ArrayCoerceExpr *node, deparse_expr_cxt *context) +{ + deparseExpr(node->arg, context); + + /* + * No difference how to deparse explicit cast, but if we omit implicit + * cast in the query, it'll be more user-friendly + */ + if (node->coerceformat != COERCE_IMPLICIT_CAST) + appendStringInfo(context->buf, "::%s", + deparse_type_name(node->resulttype, + node->resulttypmod)); +} + /* * Deparse a BoolExpr node. */ diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 2185b42bb4f79..4b6e49a5d950d 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -710,12 +710,12 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- Op Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (("C 1" = (- "C 1"))) (3 rows) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c1 IS NOT NULL) IS DISTINCT FROM (c1 IS NOT NULL); -- DistinctExpr - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------- Foreign Scan on public.ft1 t1 Output: c1, c2, c3, c4, c5, c6, c7, c8 - Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((("C 1" IS NOT NULL) IS DISTINCT FROM ("C 1" IS NOT NULL))) + Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (((c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL))) (3 rows) EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr @@ -1180,6 +1180,27 @@ SELECT * FROM ft1 WHERE CASE c3 COLLATE "C" WHEN c6 THEN true ELSE c3 < 'bar' EN Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" (4 rows) +-- Test array type conversion pushdown +SET plan_cache_mode = force_generic_plan; +PREPARE s(varchar[]) AS SELECT count(*) FROM ft2 WHERE c6 = ANY ($1); +EXPLAIN (VERBOSE, COSTS OFF) +EXECUTE s(ARRAY['1','2']); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Foreign Scan + Output: (count(*)) + Relations: Aggregate on (public.ft2) + Remote SQL: SELECT count(*) FROM "S 1"."T 1" WHERE ((c6 = ANY ($1::character varying[]))) +(4 rows) + +EXECUTE s(ARRAY['1','2']); + count +------- + 200 +(1 row) + +DEALLOCATE s; +RESET plan_cache_mode; -- a regconfig constant referring to this text search configuration -- is initially unshippable CREATE TEXT SEARCH CONFIGURATION public.custom_search diff --git a/contrib/postgres_fdw/meson.build b/contrib/postgres_fdw/meson.build index 8b29be24deeb7..5c11bc6496fa8 100644 --- a/contrib/postgres_fdw/meson.build +++ b/contrib/postgres_fdw/meson.build @@ -39,7 +39,7 @@ tests += { 'postgres_fdw', 'query_cancel', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], }, 'tap': { 'tests': [ diff --git a/contrib/postgres_fdw/option.c b/contrib/postgres_fdw/option.c index c2f936640bca8..d6fa89bad9399 100644 --- a/contrib/postgres_fdw/option.c +++ b/contrib/postgres_fdw/option.c @@ -21,6 +21,7 @@ #include "libpq/libpq-be.h" #include "postgres_fdw.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/varlena.h" /* @@ -39,12 +40,6 @@ typedef struct PgFdwOption */ static PgFdwOption *postgres_fdw_options; -/* - * Valid options for libpq. - * Allocated and filled in InitPgFdwOptions. - */ -static PQconninfoOption *libpq_options; - /* * GUC parameters */ @@ -239,6 +234,7 @@ static void InitPgFdwOptions(void) { int num_libpq_opts; + PQconninfoOption *libpq_options; PQconninfoOption *lopt; PgFdwOption *popt; @@ -307,8 +303,8 @@ InitPgFdwOptions(void) * Get list of valid libpq options. * * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. + * the lifetime of this backend process. Hence, we'll allocate it in + * TopMemoryContext. */ libpq_options = PQconndefaults(); if (!libpq_options) /* assume reason for failure is OOM */ @@ -325,19 +321,11 @@ InitPgFdwOptions(void) /* * Construct an array which consists of all valid options for * postgres_fdw, by appending FDW-specific options to libpq options. - * - * We use plain malloc here to allocate postgres_fdw_options because it - * lives as long as the backend process does. Besides, keeping - * libpq_options in memory allows us to avoid copying every keyword - * string. */ postgres_fdw_options = (PgFdwOption *) - malloc(sizeof(PgFdwOption) * num_libpq_opts + - sizeof(non_libpq_options)); - if (postgres_fdw_options == NULL) - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"))); + MemoryContextAlloc(TopMemoryContext, + sizeof(PgFdwOption) * num_libpq_opts + + sizeof(non_libpq_options)); popt = postgres_fdw_options; for (lopt = libpq_options; lopt->keyword; lopt++) @@ -355,8 +343,8 @@ InitPgFdwOptions(void) if (strncmp(lopt->keyword, "oauth_", strlen("oauth_")) == 0) continue; - /* We don't have to copy keyword string, as described above. */ - popt->keyword = lopt->keyword; + popt->keyword = MemoryContextStrdup(TopMemoryContext, + lopt->keyword); /* * "user" and any secret options are allowed only on user mappings. @@ -371,6 +359,9 @@ InitPgFdwOptions(void) popt++; } + /* Done with libpq's output structure. */ + PQconninfoFree(libpq_options); + /* Append FDW-specific options and dummy terminator. */ memcpy(popt, non_libpq_options, sizeof(non_libpq_options)); } diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 4283ce9f96252..456b267f70b5b 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -240,7 +240,6 @@ typedef struct PgFdwDirectModifyState PGresult *result; /* result for query */ int num_tuples; /* # of result tuples */ int next_tuple; /* index of next one to return */ - MemoryContextCallback result_cb; /* ensures result will get freed */ Relation resultRel; /* relcache entry for the target relation */ AttrNumber *attnoMap; /* array of attnums of input user columns */ AttrNumber ctidAttno; /* attnum of input ctid column */ @@ -1703,13 +1702,9 @@ postgresReScanForeignScan(ForeignScanState *node) return; } - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(fsstate->conn, sql, fsstate->conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fsstate->conn, true, sql); + pgfdw_report_error(res, fsstate->conn, sql); PQclear(res); /* Now force a fresh FETCH. */ @@ -2671,17 +2666,6 @@ postgresBeginDirectModify(ForeignScanState *node, int eflags) dmstate = (PgFdwDirectModifyState *) palloc0(sizeof(PgFdwDirectModifyState)); node->fdw_state = dmstate; - /* - * We use a memory context callback to ensure that the dmstate's PGresult - * (if any) will be released, even if the query fails somewhere that's - * outside our control. The callback is always armed for the duration of - * the query; this relies on PQclear(NULL) being a no-op. - */ - dmstate->result_cb.func = (MemoryContextCallbackFunction) PQclear; - dmstate->result_cb.arg = NULL; - MemoryContextRegisterResetCallback(CurrentMemoryContext, - &dmstate->result_cb); - /* * Identify which user to do the remote access as. This should match what * ExecCheckPermissions() does. @@ -2829,13 +2813,7 @@ postgresEndDirectModify(ForeignScanState *node) return; /* Release PGresult */ - if (dmstate->result) - { - PQclear(dmstate->result); - dmstate->result = NULL; - /* ... and don't forget to disable the callback */ - dmstate->result_cb.arg = NULL; - } + PQclear(dmstate->result); /* Release remote connection */ ReleaseConnection(dmstate->conn); @@ -3507,6 +3485,13 @@ estimate_path_cost_size(PlannerInfo *root, { Assert(foreignrel->reloptkind == RELOPT_UPPER_REL && fpinfo->stage == UPPERREL_GROUP_AGG); + + /* + * We can only get here when this function is called from + * add_foreign_ordered_paths() or add_foreign_final_paths(); + * in which cases, the passed-in fpextra should not be NULL. + */ + Assert(fpextra); adjust_foreign_grouping_path_cost(root, pathkeys, retrieved_rows, width, fpextra->limit_tuples, @@ -3619,41 +3604,32 @@ get_remote_estimate(const char *sql, PGconn *conn, double *rows, int *width, Cost *startup_cost, Cost *total_cost) { - PGresult *volatile res = NULL; - - /* PGresult must be released before leaving this function. */ - PG_TRY(); - { - char *line; - char *p; - int n; + PGresult *res; + char *line; + char *p; + int n; - /* - * Execute EXPLAIN remotely. - */ - res = pgfdw_exec_query(conn, sql, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql); + /* + * Execute EXPLAIN remotely. + */ + res = pgfdw_exec_query(conn, sql, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql); - /* - * Extract cost numbers for topmost plan node. Note we search for a - * left paren from the end of the line to avoid being confused by - * other uses of parentheses. - */ - line = PQgetvalue(res, 0, 0); - p = strrchr(line, '('); - if (p == NULL) - elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); - n = sscanf(p, "(cost=%lf..%lf rows=%lf width=%d)", - startup_cost, total_cost, rows, width); - if (n != 4) - elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + /* + * Extract cost numbers for topmost plan node. Note we search for a left + * paren from the end of the line to avoid being confused by other uses of + * parentheses. + */ + line = PQgetvalue(res, 0, 0); + p = strrchr(line, '('); + if (p == NULL) + elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); + n = sscanf(p, "(cost=%lf..%lf rows=%lf width=%d)", + startup_cost, total_cost, rows, width); + if (n != 4) + elog(ERROR, "could not interpret EXPLAIN output: \"%s\"", line); + PQclear(res); } /* @@ -3793,17 +3769,14 @@ create_cursor(ForeignScanState *node) */ if (!PQsendQueryParams(conn, buf.data, numParams, NULL, values, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, conn, false, buf.data); + pgfdw_report_error(NULL, conn, buf.data); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, fsstate->query); + pgfdw_report_error(res, conn, fsstate->query); PQclear(res); /* Mark the cursor as created, and show no tuples have been retrieved */ @@ -3825,7 +3798,10 @@ static void fetch_more_data(ForeignScanState *node) { PgFdwScanState *fsstate = (PgFdwScanState *) node->fdw_state; - PGresult *volatile res = NULL; + PGconn *conn = fsstate->conn; + PGresult *res; + int numrows; + int i; MemoryContext oldcontext; /* @@ -3836,74 +3812,63 @@ fetch_more_data(ForeignScanState *node) MemoryContextReset(fsstate->batch_cxt); oldcontext = MemoryContextSwitchTo(fsstate->batch_cxt); - /* PGresult must be released before leaving this function. */ - PG_TRY(); + if (fsstate->async_capable) { - PGconn *conn = fsstate->conn; - int numrows; - int i; + Assert(fsstate->conn_state->pendingAreq); - if (fsstate->async_capable) - { - Assert(fsstate->conn_state->pendingAreq); + /* + * The query was already sent by an earlier call to + * fetch_more_data_begin. So now we just fetch the result. + */ + res = pgfdw_get_result(conn); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, fsstate->query); - /* - * The query was already sent by an earlier call to - * fetch_more_data_begin. So now we just fetch the result. - */ - res = pgfdw_get_result(conn); - /* On error, report the original query, not the FETCH. */ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, fsstate->query); + /* Reset per-connection state */ + fsstate->conn_state->pendingAreq = NULL; + } + else + { + char sql[64]; - /* Reset per-connection state */ - fsstate->conn_state->pendingAreq = NULL; - } - else - { - char sql[64]; + /* This is a regular synchronous fetch. */ + snprintf(sql, sizeof(sql), "FETCH %d FROM c%u", + fsstate->fetch_size, fsstate->cursor_number); - /* This is a regular synchronous fetch. */ - snprintf(sql, sizeof(sql), "FETCH %d FROM c%u", - fsstate->fetch_size, fsstate->cursor_number); + res = pgfdw_exec_query(conn, sql, fsstate->conn_state); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, fsstate->query); + } - res = pgfdw_exec_query(conn, sql, fsstate->conn_state); - /* On error, report the original query, not the FETCH. */ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, fsstate->query); - } + /* Convert the data into HeapTuples */ + numrows = PQntuples(res); + fsstate->tuples = (HeapTuple *) palloc0(numrows * sizeof(HeapTuple)); + fsstate->num_tuples = numrows; + fsstate->next_tuple = 0; - /* Convert the data into HeapTuples */ - numrows = PQntuples(res); - fsstate->tuples = (HeapTuple *) palloc0(numrows * sizeof(HeapTuple)); - fsstate->num_tuples = numrows; - fsstate->next_tuple = 0; + for (i = 0; i < numrows; i++) + { + Assert(IsA(node->ss.ps.plan, ForeignScan)); - for (i = 0; i < numrows; i++) - { - Assert(IsA(node->ss.ps.plan, ForeignScan)); - - fsstate->tuples[i] = - make_tuple_from_result_row(res, i, - fsstate->rel, - fsstate->attinmeta, - fsstate->retrieved_attrs, - node, - fsstate->temp_cxt); - } + fsstate->tuples[i] = + make_tuple_from_result_row(res, i, + fsstate->rel, + fsstate->attinmeta, + fsstate->retrieved_attrs, + node, + fsstate->temp_cxt); + } - /* Update fetch_ct_2 */ - if (fsstate->fetch_ct_2 < 2) - fsstate->fetch_ct_2++; + /* Update fetch_ct_2 */ + if (fsstate->fetch_ct_2 < 2) + fsstate->fetch_ct_2++; - /* Must be EOF if we didn't get as many tuples as we asked for. */ - fsstate->eof_reached = (numrows < fsstate->fetch_size); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + /* Must be EOF if we didn't get as many tuples as we asked for. */ + fsstate->eof_reached = (numrows < fsstate->fetch_size); + + PQclear(res); MemoryContextSwitchTo(oldcontext); } @@ -3977,14 +3942,9 @@ close_cursor(PGconn *conn, unsigned int cursor_number, PGresult *res; snprintf(sql, sizeof(sql), "CLOSE c%u", cursor_number); - - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(conn, sql, conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, true, sql); + pgfdw_report_error(res, conn, sql); PQclear(res); } @@ -4192,18 +4152,15 @@ execute_foreign_modify(EState *estate, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, fmstate->conn, false, fmstate->query); + pgfdw_report_error(NULL, fmstate->conn, fmstate->query); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(fmstate->conn); if (PQresultStatus(res) != (fmstate->has_returning ? PGRES_TUPLES_OK : PGRES_COMMAND_OK)) - pgfdw_report_error(ERROR, res, fmstate->conn, true, fmstate->query); + pgfdw_report_error(res, fmstate->conn, fmstate->query); /* Check number of rows affected, and fetch RETURNING tuple if any */ if (fmstate->has_returning) @@ -4262,17 +4219,14 @@ prepare_foreign_modify(PgFdwModifyState *fmstate) fmstate->query, 0, NULL)) - pgfdw_report_error(ERROR, NULL, fmstate->conn, false, fmstate->query); + pgfdw_report_error(NULL, fmstate->conn, fmstate->query); /* * Get the result, and check for success. - * - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. */ res = pgfdw_get_result(fmstate->conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fmstate->conn, true, fmstate->query); + pgfdw_report_error(res, fmstate->conn, fmstate->query); PQclear(res); /* This action shows that the prepare has been done. */ @@ -4363,37 +4317,25 @@ convert_prep_stmt_params(PgFdwModifyState *fmstate, /* * store_returning_result * Store the result of a RETURNING clause - * - * On error, be sure to release the PGresult on the way out. Callers do not - * have PG_TRY blocks to ensure this happens. */ static void store_returning_result(PgFdwModifyState *fmstate, TupleTableSlot *slot, PGresult *res) { - PG_TRY(); - { - HeapTuple newtup; + HeapTuple newtup; - newtup = make_tuple_from_result_row(res, 0, - fmstate->rel, - fmstate->attinmeta, - fmstate->retrieved_attrs, - NULL, - fmstate->temp_cxt); + newtup = make_tuple_from_result_row(res, 0, + fmstate->rel, + fmstate->attinmeta, + fmstate->retrieved_attrs, + NULL, + fmstate->temp_cxt); - /* - * The returning slot will not necessarily be suitable to store - * heaptuples directly, so allow for conversion. - */ - ExecForceStoreHeapTuple(newtup, slot, true); - } - PG_CATCH(); - { - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + /* + * The returning slot will not necessarily be suitable to store heaptuples + * directly, so allow for conversion. + */ + ExecForceStoreHeapTuple(newtup, slot, true); } /* @@ -4429,14 +4371,9 @@ deallocate_query(PgFdwModifyState *fmstate) return; snprintf(sql, sizeof(sql), "DEALLOCATE %s", fmstate->p_name); - - /* - * We don't use a PG_TRY block here, so be careful not to throw error - * without releasing the PGresult. - */ res = pgfdw_exec_query(fmstate->conn, sql, fmstate->conn_state); if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, fmstate->conn, true, sql); + pgfdw_report_error(res, fmstate->conn, sql); PQclear(res); pfree(fmstate->p_name); fmstate->p_name = NULL; @@ -4604,24 +4541,24 @@ execute_dml_stmt(ForeignScanState *node) */ if (!PQsendQueryParams(dmstate->conn, dmstate->query, numParams, NULL, values, NULL, NULL, 0)) - pgfdw_report_error(ERROR, NULL, dmstate->conn, false, dmstate->query); + pgfdw_report_error(NULL, dmstate->conn, dmstate->query); /* * Get the result, and check for success. - * - * We use a memory context callback to ensure that the PGresult will be - * released, even if the query fails somewhere that's outside our control. - * The callback is already registered, just need to fill in its arg. */ - Assert(dmstate->result == NULL); dmstate->result = pgfdw_get_result(dmstate->conn); - dmstate->result_cb.arg = dmstate->result; - if (PQresultStatus(dmstate->result) != (dmstate->has_returning ? PGRES_TUPLES_OK : PGRES_COMMAND_OK)) - pgfdw_report_error(ERROR, dmstate->result, dmstate->conn, false, + pgfdw_report_error(dmstate->result, dmstate->conn, dmstate->query); + /* + * The result potentially needs to survive across multiple executor row + * cycles, so move it to the context where the dmstate is. + */ + dmstate->result = libpqsrv_PGresultSetParent(dmstate->result, + GetMemoryChunkContext(dmstate)); + /* Get the number of rows affected. */ if (dmstate->has_returning) dmstate->num_tuples = PQntuples(dmstate->result); @@ -4958,7 +4895,7 @@ postgresAnalyzeForeignTable(Relation relation, UserMapping *user; PGconn *conn; StringInfoData sql; - PGresult *volatile res = NULL; + PGresult *res; /* Return the row-analysis function pointer */ *func = postgresAcquireSampleRowsFunc; @@ -4984,22 +4921,14 @@ postgresAnalyzeForeignTable(Relation relation, initStringInfo(&sql); deparseAnalyzeSizeSql(&sql, relation); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - if (PQntuples(res) != 1 || PQnfields(res) != 1) - elog(ERROR, "unexpected result from deparseAnalyzeSizeSql query"); - *totalpages = strtoul(PQgetvalue(res, 0, 0), NULL, 10); - } - PG_FINALLY(); - { - PQclear(res); - } - PG_END_TRY(); + if (PQntuples(res) != 1 || PQnfields(res) != 1) + elog(ERROR, "unexpected result from deparseAnalyzeSizeSql query"); + *totalpages = strtoul(PQgetvalue(res, 0, 0), NULL, 10); + PQclear(res); ReleaseConnection(conn); @@ -5020,9 +4949,9 @@ postgresGetAnalyzeInfoForForeignTable(Relation relation, bool *can_tablesample) UserMapping *user; PGconn *conn; StringInfoData sql; - PGresult *volatile res = NULL; - volatile double reltuples = -1; - volatile char relkind = 0; + PGresult *res; + double reltuples; + char relkind; /* assume the remote relation does not support TABLESAMPLE */ *can_tablesample = false; @@ -5041,24 +4970,15 @@ postgresGetAnalyzeInfoForForeignTable(Relation relation, bool *can_tablesample) initStringInfo(&sql); deparseAnalyzeInfoSql(&sql, relation); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - if (PQntuples(res) != 1 || PQnfields(res) != 2) - elog(ERROR, "unexpected result from deparseAnalyzeInfoSql query"); - reltuples = strtod(PQgetvalue(res, 0, 0), NULL); - relkind = *(PQgetvalue(res, 0, 1)); - } - PG_FINALLY(); - { - if (res) - PQclear(res); - } - PG_END_TRY(); + if (PQntuples(res) != 1 || PQnfields(res) != 2) + elog(ERROR, "unexpected result from deparseAnalyzeInfoSql query"); + reltuples = strtod(PQgetvalue(res, 0, 0), NULL); + relkind = *(PQgetvalue(res, 0, 1)); + PQclear(res); ReleaseConnection(conn); @@ -5098,10 +5018,12 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel, int server_version_num; PgFdwSamplingMethod method = ANALYZE_SAMPLE_AUTO; /* auto is default */ double sample_frac = -1.0; - double reltuples; + double reltuples = -1.0; unsigned int cursor_number; StringInfoData sql; - PGresult *volatile res = NULL; + PGresult *res; + char fetch_sql[64]; + int fetch_size; ListCell *lc; /* Initialize workspace state */ @@ -5278,91 +5200,76 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel, deparseAnalyzeSql(&sql, relation, method, sample_frac, &astate.retrieved_attrs); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - char fetch_sql[64]; - int fetch_size; - - res = pgfdw_exec_query(conn, sql.data, NULL); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); - PQclear(res); - res = NULL; + res = pgfdw_exec_query(conn, sql.data, NULL); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pgfdw_report_error(res, conn, sql.data); + PQclear(res); - /* - * Determine the fetch size. The default is arbitrary, but shouldn't - * be enormous. - */ - fetch_size = 100; - foreach(lc, server->options) - { - DefElem *def = (DefElem *) lfirst(lc); + /* + * Determine the fetch size. The default is arbitrary, but shouldn't be + * enormous. + */ + fetch_size = 100; + foreach(lc, server->options) + { + DefElem *def = (DefElem *) lfirst(lc); - if (strcmp(def->defname, "fetch_size") == 0) - { - (void) parse_int(defGetString(def), &fetch_size, 0, NULL); - break; - } - } - foreach(lc, table->options) + if (strcmp(def->defname, "fetch_size") == 0) { - DefElem *def = (DefElem *) lfirst(lc); - - if (strcmp(def->defname, "fetch_size") == 0) - { - (void) parse_int(defGetString(def), &fetch_size, 0, NULL); - break; - } + (void) parse_int(defGetString(def), &fetch_size, 0, NULL); + break; } + } + foreach(lc, table->options) + { + DefElem *def = (DefElem *) lfirst(lc); - /* Construct command to fetch rows from remote. */ - snprintf(fetch_sql, sizeof(fetch_sql), "FETCH %d FROM c%u", - fetch_size, cursor_number); - - /* Retrieve and process rows a batch at a time. */ - for (;;) + if (strcmp(def->defname, "fetch_size") == 0) { - int numrows; - int i; + (void) parse_int(defGetString(def), &fetch_size, 0, NULL); + break; + } + } - /* Allow users to cancel long query */ - CHECK_FOR_INTERRUPTS(); + /* Construct command to fetch rows from remote. */ + snprintf(fetch_sql, sizeof(fetch_sql), "FETCH %d FROM c%u", + fetch_size, cursor_number); - /* - * XXX possible future improvement: if rowstoskip is large, we - * could issue a MOVE rather than physically fetching the rows, - * then just adjust rowstoskip and samplerows appropriately. - */ + /* Retrieve and process rows a batch at a time. */ + for (;;) + { + int numrows; + int i; - /* Fetch some rows */ - res = pgfdw_exec_query(conn, fetch_sql, NULL); - /* On error, report the original query, not the FETCH. */ - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, sql.data); + /* Allow users to cancel long query */ + CHECK_FOR_INTERRUPTS(); - /* Process whatever we got. */ - numrows = PQntuples(res); - for (i = 0; i < numrows; i++) - analyze_row_processor(res, i, &astate); + /* + * XXX possible future improvement: if rowstoskip is large, we could + * issue a MOVE rather than physically fetching the rows, then just + * adjust rowstoskip and samplerows appropriately. + */ - PQclear(res); - res = NULL; + /* Fetch some rows */ + res = pgfdw_exec_query(conn, fetch_sql, NULL); + /* On error, report the original query, not the FETCH. */ + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, sql.data); - /* Must be EOF if we didn't get all the rows requested. */ - if (numrows < fetch_size) - break; - } + /* Process whatever we got. */ + numrows = PQntuples(res); + for (i = 0; i < numrows; i++) + analyze_row_processor(res, i, &astate); - /* Close the cursor, just to be tidy. */ - close_cursor(conn, cursor_number, NULL); - } - PG_CATCH(); - { PQclear(res); - PG_RE_THROW(); + + /* Must be EOF if we didn't get all the rows requested. */ + if (numrows < fetch_size) + break; } - PG_END_TRY(); + + /* Close the cursor, just to be tidy. */ + close_cursor(conn, cursor_number, NULL); ReleaseConnection(conn); @@ -5474,7 +5381,7 @@ postgresImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) UserMapping *mapping; PGconn *conn; StringInfoData buf; - PGresult *volatile res = NULL; + PGresult *res; int numrows, i; ListCell *lc; @@ -5513,243 +5420,231 @@ postgresImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) /* Create workspace for strings */ initStringInfo(&buf); - /* In what follows, do not risk leaking any PGresults. */ - PG_TRY(); - { - /* Check that the schema really exists */ - appendStringInfoString(&buf, "SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = "); - deparseStringLiteral(&buf, stmt->remote_schema); + /* Check that the schema really exists */ + appendStringInfoString(&buf, "SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = "); + deparseStringLiteral(&buf, stmt->remote_schema); - res = pgfdw_exec_query(conn, buf.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, buf.data); + res = pgfdw_exec_query(conn, buf.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, buf.data); - if (PQntuples(res) != 1) - ereport(ERROR, - (errcode(ERRCODE_FDW_SCHEMA_NOT_FOUND), - errmsg("schema \"%s\" is not present on foreign server \"%s\"", - stmt->remote_schema, server->servername))); + if (PQntuples(res) != 1) + ereport(ERROR, + (errcode(ERRCODE_FDW_SCHEMA_NOT_FOUND), + errmsg("schema \"%s\" is not present on foreign server \"%s\"", + stmt->remote_schema, server->servername))); - PQclear(res); - res = NULL; - resetStringInfo(&buf); + PQclear(res); + resetStringInfo(&buf); - /* - * Fetch all table data from this schema, possibly restricted by - * EXCEPT or LIMIT TO. (We don't actually need to pay any attention - * to EXCEPT/LIMIT TO here, because the core code will filter the - * statements we return according to those lists anyway. But it - * should save a few cycles to not process excluded tables in the - * first place.) - * - * Import table data for partitions only when they are explicitly - * specified in LIMIT TO clause. Otherwise ignore them and only - * include the definitions of the root partitioned tables to allow - * access to the complete remote data set locally in the schema - * imported. - * - * Note: because we run the connection with search_path restricted to - * pg_catalog, the format_type() and pg_get_expr() outputs will always - * include a schema name for types/functions in other schemas, which - * is what we want. - */ + /* + * Fetch all table data from this schema, possibly restricted by EXCEPT or + * LIMIT TO. (We don't actually need to pay any attention to EXCEPT/LIMIT + * TO here, because the core code will filter the statements we return + * according to those lists anyway. But it should save a few cycles to + * not process excluded tables in the first place.) + * + * Import table data for partitions only when they are explicitly + * specified in LIMIT TO clause. Otherwise ignore them and only include + * the definitions of the root partitioned tables to allow access to the + * complete remote data set locally in the schema imported. + * + * Note: because we run the connection with search_path restricted to + * pg_catalog, the format_type() and pg_get_expr() outputs will always + * include a schema name for types/functions in other schemas, which is + * what we want. + */ + appendStringInfoString(&buf, + "SELECT relname, " + " attname, " + " format_type(atttypid, atttypmod), " + " attnotnull, " + " pg_get_expr(adbin, adrelid), "); + + /* Generated columns are supported since Postgres 12 */ + if (PQserverVersion(conn) >= 120000) appendStringInfoString(&buf, - "SELECT relname, " - " attname, " - " format_type(atttypid, atttypmod), " - " attnotnull, " - " pg_get_expr(adbin, adrelid), "); - - /* Generated columns are supported since Postgres 12 */ - if (PQserverVersion(conn) >= 120000) - appendStringInfoString(&buf, - " attgenerated, "); - else - appendStringInfoString(&buf, - " NULL, "); - - if (import_collate) - appendStringInfoString(&buf, - " collname, " - " collnsp.nspname "); - else - appendStringInfoString(&buf, - " NULL, NULL "); - + " attgenerated, "); + else appendStringInfoString(&buf, - "FROM pg_class c " - " JOIN pg_namespace n ON " - " relnamespace = n.oid " - " LEFT JOIN pg_attribute a ON " - " attrelid = c.oid AND attnum > 0 " - " AND NOT attisdropped " - " LEFT JOIN pg_attrdef ad ON " - " adrelid = c.oid AND adnum = attnum "); - - if (import_collate) - appendStringInfoString(&buf, - " LEFT JOIN pg_collation coll ON " - " coll.oid = attcollation " - " LEFT JOIN pg_namespace collnsp ON " - " collnsp.oid = collnamespace "); + " NULL, "); + if (import_collate) appendStringInfoString(&buf, - "WHERE c.relkind IN (" - CppAsString2(RELKIND_RELATION) "," - CppAsString2(RELKIND_VIEW) "," - CppAsString2(RELKIND_FOREIGN_TABLE) "," - CppAsString2(RELKIND_MATVIEW) "," - CppAsString2(RELKIND_PARTITIONED_TABLE) ") " - " AND n.nspname = "); - deparseStringLiteral(&buf, stmt->remote_schema); - - /* Partitions are supported since Postgres 10 */ - if (PQserverVersion(conn) >= 100000 && - stmt->list_type != FDW_IMPORT_SCHEMA_LIMIT_TO) - appendStringInfoString(&buf, " AND NOT c.relispartition "); - - /* Apply restrictions for LIMIT TO and EXCEPT */ - if (stmt->list_type == FDW_IMPORT_SCHEMA_LIMIT_TO || - stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + " collname, " + " collnsp.nspname "); + else + appendStringInfoString(&buf, + " NULL, NULL "); + + appendStringInfoString(&buf, + "FROM pg_class c " + " JOIN pg_namespace n ON " + " relnamespace = n.oid " + " LEFT JOIN pg_attribute a ON " + " attrelid = c.oid AND attnum > 0 " + " AND NOT attisdropped " + " LEFT JOIN pg_attrdef ad ON " + " adrelid = c.oid AND adnum = attnum "); + + if (import_collate) + appendStringInfoString(&buf, + " LEFT JOIN pg_collation coll ON " + " coll.oid = attcollation " + " LEFT JOIN pg_namespace collnsp ON " + " collnsp.oid = collnamespace "); + + appendStringInfoString(&buf, + "WHERE c.relkind IN (" + CppAsString2(RELKIND_RELATION) "," + CppAsString2(RELKIND_VIEW) "," + CppAsString2(RELKIND_FOREIGN_TABLE) "," + CppAsString2(RELKIND_MATVIEW) "," + CppAsString2(RELKIND_PARTITIONED_TABLE) ") " + " AND n.nspname = "); + deparseStringLiteral(&buf, stmt->remote_schema); + + /* Partitions are supported since Postgres 10 */ + if (PQserverVersion(conn) >= 100000 && + stmt->list_type != FDW_IMPORT_SCHEMA_LIMIT_TO) + appendStringInfoString(&buf, " AND NOT c.relispartition "); + + /* Apply restrictions for LIMIT TO and EXCEPT */ + if (stmt->list_type == FDW_IMPORT_SCHEMA_LIMIT_TO || + stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + { + bool first_item = true; + + appendStringInfoString(&buf, " AND c.relname "); + if (stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) + appendStringInfoString(&buf, "NOT "); + appendStringInfoString(&buf, "IN ("); + + /* Append list of table names within IN clause */ + foreach(lc, stmt->table_list) { - bool first_item = true; + RangeVar *rv = (RangeVar *) lfirst(lc); - appendStringInfoString(&buf, " AND c.relname "); - if (stmt->list_type == FDW_IMPORT_SCHEMA_EXCEPT) - appendStringInfoString(&buf, "NOT "); - appendStringInfoString(&buf, "IN ("); + if (first_item) + first_item = false; + else + appendStringInfoString(&buf, ", "); + deparseStringLiteral(&buf, rv->relname); + } + appendStringInfoChar(&buf, ')'); + } - /* Append list of table names within IN clause */ - foreach(lc, stmt->table_list) - { - RangeVar *rv = (RangeVar *) lfirst(lc); + /* Append ORDER BY at the end of query to ensure output ordering */ + appendStringInfoString(&buf, " ORDER BY c.relname, a.attnum"); - if (first_item) - first_item = false; - else - appendStringInfoString(&buf, ", "); - deparseStringLiteral(&buf, rv->relname); - } - appendStringInfoChar(&buf, ')'); - } + /* Fetch the data */ + res = pgfdw_exec_query(conn, buf.data, NULL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + pgfdw_report_error(res, conn, buf.data); - /* Append ORDER BY at the end of query to ensure output ordering */ - appendStringInfoString(&buf, " ORDER BY c.relname, a.attnum"); + /* Process results */ + numrows = PQntuples(res); + /* note: incrementation of i happens in inner loop's while() test */ + for (i = 0; i < numrows;) + { + char *tablename = PQgetvalue(res, i, 0); + bool first_item = true; - /* Fetch the data */ - res = pgfdw_exec_query(conn, buf.data, NULL); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pgfdw_report_error(ERROR, res, conn, false, buf.data); + resetStringInfo(&buf); + appendStringInfo(&buf, "CREATE FOREIGN TABLE %s (\n", + quote_identifier(tablename)); - /* Process results */ - numrows = PQntuples(res); - /* note: incrementation of i happens in inner loop's while() test */ - for (i = 0; i < numrows;) + /* Scan all rows for this table */ + do { - char *tablename = PQgetvalue(res, i, 0); - bool first_item = true; + char *attname; + char *typename; + char *attnotnull; + char *attgenerated; + char *attdefault; + char *collname; + char *collnamespace; + + /* If table has no columns, we'll see nulls here */ + if (PQgetisnull(res, i, 1)) + continue; - resetStringInfo(&buf); - appendStringInfo(&buf, "CREATE FOREIGN TABLE %s (\n", - quote_identifier(tablename)); + attname = PQgetvalue(res, i, 1); + typename = PQgetvalue(res, i, 2); + attnotnull = PQgetvalue(res, i, 3); + attdefault = PQgetisnull(res, i, 4) ? NULL : + PQgetvalue(res, i, 4); + attgenerated = PQgetisnull(res, i, 5) ? NULL : + PQgetvalue(res, i, 5); + collname = PQgetisnull(res, i, 6) ? NULL : + PQgetvalue(res, i, 6); + collnamespace = PQgetisnull(res, i, 7) ? NULL : + PQgetvalue(res, i, 7); + + if (first_item) + first_item = false; + else + appendStringInfoString(&buf, ",\n"); - /* Scan all rows for this table */ - do - { - char *attname; - char *typename; - char *attnotnull; - char *attgenerated; - char *attdefault; - char *collname; - char *collnamespace; - - /* If table has no columns, we'll see nulls here */ - if (PQgetisnull(res, i, 1)) - continue; + /* Print column name and type */ + appendStringInfo(&buf, " %s %s", + quote_identifier(attname), + typename); - attname = PQgetvalue(res, i, 1); - typename = PQgetvalue(res, i, 2); - attnotnull = PQgetvalue(res, i, 3); - attdefault = PQgetisnull(res, i, 4) ? NULL : - PQgetvalue(res, i, 4); - attgenerated = PQgetisnull(res, i, 5) ? NULL : - PQgetvalue(res, i, 5); - collname = PQgetisnull(res, i, 6) ? NULL : - PQgetvalue(res, i, 6); - collnamespace = PQgetisnull(res, i, 7) ? NULL : - PQgetvalue(res, i, 7); - - if (first_item) - first_item = false; - else - appendStringInfoString(&buf, ",\n"); + /* + * Add column_name option so that renaming the foreign table's + * column doesn't break the association to the underlying column. + */ + appendStringInfoString(&buf, " OPTIONS (column_name "); + deparseStringLiteral(&buf, attname); + appendStringInfoChar(&buf, ')'); - /* Print column name and type */ - appendStringInfo(&buf, " %s %s", - quote_identifier(attname), - typename); + /* Add COLLATE if needed */ + if (import_collate && collname != NULL && collnamespace != NULL) + appendStringInfo(&buf, " COLLATE %s.%s", + quote_identifier(collnamespace), + quote_identifier(collname)); - /* - * Add column_name option so that renaming the foreign table's - * column doesn't break the association to the underlying - * column. - */ - appendStringInfoString(&buf, " OPTIONS (column_name "); - deparseStringLiteral(&buf, attname); - appendStringInfoChar(&buf, ')'); - - /* Add COLLATE if needed */ - if (import_collate && collname != NULL && collnamespace != NULL) - appendStringInfo(&buf, " COLLATE %s.%s", - quote_identifier(collnamespace), - quote_identifier(collname)); - - /* Add DEFAULT if needed */ - if (import_default && attdefault != NULL && - (!attgenerated || !attgenerated[0])) - appendStringInfo(&buf, " DEFAULT %s", attdefault); - - /* Add GENERATED if needed */ - if (import_generated && attgenerated != NULL && - attgenerated[0] == ATTRIBUTE_GENERATED_STORED) - { - Assert(attdefault != NULL); - appendStringInfo(&buf, - " GENERATED ALWAYS AS (%s) STORED", - attdefault); - } + /* Add DEFAULT if needed */ + if (import_default && attdefault != NULL && + (!attgenerated || !attgenerated[0])) + appendStringInfo(&buf, " DEFAULT %s", attdefault); - /* Add NOT NULL if needed */ - if (import_not_null && attnotnull[0] == 't') - appendStringInfoString(&buf, " NOT NULL"); + /* Add GENERATED if needed */ + if (import_generated && attgenerated != NULL && + attgenerated[0] == ATTRIBUTE_GENERATED_STORED) + { + Assert(attdefault != NULL); + appendStringInfo(&buf, + " GENERATED ALWAYS AS (%s) STORED", + attdefault); } - while (++i < numrows && - strcmp(PQgetvalue(res, i, 0), tablename) == 0); - /* - * Add server name and table-level options. We specify remote - * schema and table name as options (the latter to ensure that - * renaming the foreign table doesn't break the association). - */ - appendStringInfo(&buf, "\n) SERVER %s\nOPTIONS (", - quote_identifier(server->servername)); + /* Add NOT NULL if needed */ + if (import_not_null && attnotnull[0] == 't') + appendStringInfoString(&buf, " NOT NULL"); + } + while (++i < numrows && + strcmp(PQgetvalue(res, i, 0), tablename) == 0); - appendStringInfoString(&buf, "schema_name "); - deparseStringLiteral(&buf, stmt->remote_schema); - appendStringInfoString(&buf, ", table_name "); - deparseStringLiteral(&buf, tablename); + /* + * Add server name and table-level options. We specify remote schema + * and table name as options (the latter to ensure that renaming the + * foreign table doesn't break the association). + */ + appendStringInfo(&buf, "\n) SERVER %s\nOPTIONS (", + quote_identifier(server->servername)); - appendStringInfoString(&buf, ");"); + appendStringInfoString(&buf, "schema_name "); + deparseStringLiteral(&buf, stmt->remote_schema); + appendStringInfoString(&buf, ", table_name "); + deparseStringLiteral(&buf, tablename); - commands = lappend(commands, pstrdup(buf.data)); - } - } - PG_FINALLY(); - { - PQclear(res); + appendStringInfoString(&buf, ");"); + + commands = lappend(commands, pstrdup(buf.data)); } - PG_END_TRY(); + PQclear(res); ReleaseConnection(conn); @@ -7417,7 +7312,7 @@ postgresForeignAsyncNotify(AsyncRequest *areq) /* On error, report the original query, not the FETCH. */ if (!PQconsumeInput(fsstate->conn)) - pgfdw_report_error(ERROR, NULL, fsstate->conn, false, fsstate->query); + pgfdw_report_error(NULL, fsstate->conn, fsstate->query); fetch_more_data(node); @@ -7516,7 +7411,7 @@ fetch_more_data_begin(AsyncRequest *areq) fsstate->fetch_size, fsstate->cursor_number); if (!PQsendQuery(fsstate->conn, sql)) - pgfdw_report_error(ERROR, NULL, fsstate->conn, false, fsstate->query); + pgfdw_report_error(NULL, fsstate->conn, fsstate->query); /* Remember that the request is in process */ fsstate->conn_state->pendingAreq = areq; diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h index 81358f3bde7df..e69735298d78f 100644 --- a/contrib/postgres_fdw/postgres_fdw.h +++ b/contrib/postgres_fdw/postgres_fdw.h @@ -15,7 +15,7 @@ #include "foreign/foreign.h" #include "lib/stringinfo.h" -#include "libpq-fe.h" +#include "libpq/libpq-be-fe.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" #include "utils/relcache.h" @@ -166,8 +166,10 @@ extern void do_sql_command(PGconn *conn, const char *sql); extern PGresult *pgfdw_get_result(PGconn *conn); extern PGresult *pgfdw_exec_query(PGconn *conn, const char *query, PgFdwConnState *state); -extern void pgfdw_report_error(int elevel, PGresult *res, PGconn *conn, - bool clear, const char *sql); +pg_noreturn extern void pgfdw_report_error(PGresult *res, PGconn *conn, + const char *sql); +extern void pgfdw_report(int elevel, PGresult *res, PGconn *conn, + const char *sql); /* in option.c */ extern int ExtractConnectionOptions(List *defelems, diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index e534b40de3c76..31b6c685b551b 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -352,7 +352,7 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NULL; -- Nu EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NOT NULL; -- NullTest EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE round(abs(c1), 0) = 1; -- FuncExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- OpExpr(l) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c1 IS NOT NULL) IS DISTINCT FROM (c1 IS NOT NULL); -- DistinctExpr +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = (ARRAY[c1,c2,3])[1]; -- SubscriptingRef EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c6 = E'foo''s\\bar'; -- check special chars @@ -458,6 +458,15 @@ SELECT * FROM ft1 WHERE CASE c3 WHEN c6 THEN true ELSE c3 < 'bar' END; EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 WHERE CASE c3 COLLATE "C" WHEN c6 THEN true ELSE c3 < 'bar' END; +-- Test array type conversion pushdown +SET plan_cache_mode = force_generic_plan; +PREPARE s(varchar[]) AS SELECT count(*) FROM ft2 WHERE c6 = ANY ($1); +EXPLAIN (VERBOSE, COSTS OFF) +EXECUTE s(ARRAY['1','2']); +EXECUTE s(ARRAY['1','2']); +DEALLOCATE s; +RESET plan_cache_mode; + -- a regconfig constant referring to this text search configuration -- is initially unshippable CREATE TEXT SEARCH CONFIGURATION public.custom_search diff --git a/contrib/spi/refint.c b/contrib/spi/refint.c index d5e25e07ae9e2..89898cad7b0d7 100644 --- a/contrib/spi/refint.c +++ b/contrib/spi/refint.c @@ -321,7 +321,7 @@ check_foreign_key(PG_FUNCTION_ARGS) if (nrefs < 1) /* internal error */ elog(ERROR, "check_foreign_key: %d (< 1) number of references specified", nrefs); - action = tolower((unsigned char) *(args[1])); + action = pg_ascii_tolower((unsigned char) *(args[1])); if (action != 'r' && action != 'c' && action != 's') /* internal error */ elog(ERROR, "check_foreign_key: invalid action %s", args[1]); diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c index 23d3f332dbaa7..4ac291c8251f7 100644 --- a/contrib/xml2/xpath.c +++ b/contrib/xml2/xpath.c @@ -51,8 +51,8 @@ static text *pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar *toptag, static xmlChar *pgxml_texttoxmlchar(text *textstring); -static xmlXPathObjectPtr pgxml_xpath(text *document, xmlChar *xpath, - xpath_workspace *workspace); +static xpath_workspace *pgxml_xpath(text *document, xmlChar *xpath, + PgXmlErrorContext *xmlerrcxt); static void cleanup_workspace(xpath_workspace *workspace); @@ -88,19 +88,41 @@ Datum xml_encode_special_chars(PG_FUNCTION_ARGS) { text *tin = PG_GETARG_TEXT_PP(0); - text *tout; - xmlChar *ts, - *tt; + text *volatile tout = NULL; + xmlChar *volatile tt = NULL; + PgXmlErrorContext *xmlerrcxt; + + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + + PG_TRY(); + { + xmlChar *ts; - ts = pgxml_texttoxmlchar(tin); + ts = pgxml_texttoxmlchar(tin); + + tt = xmlEncodeSpecialChars(NULL, ts); + if (tt == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); + pfree(ts); + + tout = cstring_to_text((char *) tt); + } + PG_CATCH(); + { + if (tt != NULL) + xmlFree(tt); - tt = xmlEncodeSpecialChars(NULL, ts); + pg_xml_done(xmlerrcxt, true); - pfree(ts); + PG_RE_THROW(); + } + PG_END_TRY(); - tout = cstring_to_text((char *) tt); + if (tt != NULL) + xmlFree(tt); - xmlFree(tt); + pg_xml_done(xmlerrcxt, false); PG_RETURN_TEXT_P(tout); } @@ -122,62 +144,89 @@ pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar *septagname, xmlChar *plainsep) { - xmlBufferPtr buf; - xmlChar *result; - int i; + volatile xmlBufferPtr buf = NULL; + xmlChar *volatile result = NULL; + PgXmlErrorContext *xmlerrcxt; - buf = xmlBufferCreate(); + /* spin up some error handling */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) - { - xmlBufferWriteChar(buf, "<"); - xmlBufferWriteCHAR(buf, toptagname); - xmlBufferWriteChar(buf, ">"); - } - if (nodeset != NULL) + PG_TRY(); { - for (i = 0; i < nodeset->nodeNr; i++) - { - if (plainsep != NULL) - { - xmlBufferWriteCHAR(buf, - xmlXPathCastNodeToString(nodeset->nodeTab[i])); + buf = xmlBufferCreate(); - /* If this isn't the last entry, write the plain sep. */ - if (i < (nodeset->nodeNr) - 1) - xmlBufferWriteChar(buf, (char *) plainsep); - } - else + if (buf == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlBuffer"); + + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, toptagname); + xmlBufferWriteChar(buf, ">"); + } + if (nodeset != NULL) + { + for (int i = 0; i < nodeset->nodeNr; i++) { - if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + if (plainsep != NULL) { - xmlBufferWriteChar(buf, "<"); - xmlBufferWriteCHAR(buf, septagname); - xmlBufferWriteChar(buf, ">"); - } - xmlNodeDump(buf, - nodeset->nodeTab[i]->doc, - nodeset->nodeTab[i], - 1, 0); + xmlBufferWriteCHAR(buf, + xmlXPathCastNodeToString(nodeset->nodeTab[i])); - if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + /* If this isn't the last entry, write the plain sep. */ + if (i < (nodeset->nodeNr) - 1) + xmlBufferWriteChar(buf, (char *) plainsep); + } + else { - xmlBufferWriteChar(buf, ""); + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, septagname); + xmlBufferWriteChar(buf, ">"); + } + xmlNodeDump(buf, + nodeset->nodeTab[i]->doc, + nodeset->nodeTab[i], + 1, 0); + + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, ""); + } } } } - } - if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, ""); + } + + result = xmlStrdup(xmlBufferContent(buf)); + if (result == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + } + PG_CATCH(); { - xmlBufferWriteChar(buf, ""); + if (buf) + xmlBufferFree(buf); + + pg_xml_done(xmlerrcxt, true); + + PG_RE_THROW(); } - result = xmlStrdup(buf->content); + PG_END_TRY(); + xmlBufferFree(buf); + pg_xml_done(xmlerrcxt, false); + return result; } @@ -207,17 +256,30 @@ xpath_nodeset(PG_FUNCTION_ARGS) xmlChar *toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2)); xmlChar *septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(3)); xmlChar *xpath; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, toptag, septag, NULL); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, toptag, septag, NULL); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -239,17 +301,30 @@ xpath_list(PG_FUNCTION_ARGS) text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2)); xmlChar *xpath; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, NULL, NULL, plainsep); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, NULL, NULL, plainsep); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -268,9 +343,9 @@ xpath_string(PG_FUNCTION_ARGS) text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; int32 pathsize; - text *xpres; - xmlXPathObjectPtr res; - xpath_workspace workspace; + text *volatile xpres = NULL; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; pathsize = VARSIZE_ANY_EXHDR(xpathsupp); @@ -286,11 +361,25 @@ xpath_string(PG_FUNCTION_ARGS) xpath[pathsize + 7] = ')'; xpath[pathsize + 8] = '\0'; - res = pgxml_xpath(document, xpath, &workspace); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); + + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + xpres = pgxml_result_to_text(workspace->res, NULL, NULL, NULL); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - xpres = pgxml_result_to_text(res, NULL, NULL, NULL); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); pfree(xpath); @@ -308,24 +397,38 @@ xpath_number(PG_FUNCTION_ARGS) text *document = PG_GETARG_TEXT_PP(0); text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; - float4 fRes; - xmlXPathObjectPtr res; - xpath_workspace workspace; + volatile float4 fRes = 0.0; + volatile bool isNull = false; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); - - pfree(xpath); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + pfree(xpath); - if (res == NULL) - PG_RETURN_NULL(); + if (workspace->res == NULL) + isNull = true; + else + fRes = xmlXPathCastToNumber(workspace->res); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - fRes = xmlXPathCastToNumber(res); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); - if (xmlXPathIsNaN(fRes)) + if (isNull || xmlXPathIsNaN(fRes)) PG_RETURN_NULL(); PG_RETURN_FLOAT4(fRes); @@ -340,22 +443,35 @@ xpath_bool(PG_FUNCTION_ARGS) text *document = PG_GETARG_TEXT_PP(0); text *xpathsupp = PG_GETARG_TEXT_PP(1); /* XPath expression */ xmlChar *xpath; - int bRes; - xmlXPathObjectPtr res; - xpath_workspace workspace; + volatile int bRes = 0; + xpath_workspace *volatile workspace = NULL; + PgXmlErrorContext *xmlerrcxt; xpath = pgxml_texttoxmlchar(xpathsupp); + xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - res = pgxml_xpath(document, xpath, &workspace); - - pfree(xpath); + PG_TRY(); + { + workspace = pgxml_xpath(document, xpath, xmlerrcxt); + pfree(xpath); - if (res == NULL) - PG_RETURN_BOOL(false); + if (workspace->res == NULL) + bRes = 0; + else + bRes = xmlXPathCastToBoolean(workspace->res); + } + PG_CATCH(); + { + if (workspace) + cleanup_workspace(workspace); - bRes = xmlXPathCastToBoolean(res); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - cleanup_workspace(&workspace); + cleanup_workspace(workspace); + pg_xml_done(xmlerrcxt, false); PG_RETURN_BOOL(bRes); } @@ -364,57 +480,39 @@ xpath_bool(PG_FUNCTION_ARGS) /* Core function to evaluate XPath query */ -static xmlXPathObjectPtr -pgxml_xpath(text *document, xmlChar *xpath, xpath_workspace *workspace) +static xpath_workspace * +pgxml_xpath(text *document, xmlChar *xpath, PgXmlErrorContext *xmlerrcxt) { int32 docsize = VARSIZE_ANY_EXHDR(document); - PgXmlErrorContext *xmlerrcxt; xmlXPathCompExprPtr comppath; + xpath_workspace *workspace = (xpath_workspace *) + palloc0(sizeof(xpath_workspace)); workspace->doctree = NULL; workspace->ctxt = NULL; workspace->res = NULL; - xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); - - PG_TRY(); + workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), + docsize, NULL, NULL, + XML_PARSE_NOENT); + if (workspace->doctree != NULL) { - workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), - docsize, NULL, NULL, - XML_PARSE_NOENT); - if (workspace->doctree != NULL) - { - workspace->ctxt = xmlXPathNewContext(workspace->doctree); - workspace->ctxt->node = xmlDocGetRootElement(workspace->doctree); - - /* compile the path */ - comppath = xmlXPathCtxtCompile(workspace->ctxt, xpath); - if (comppath == NULL) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, - "XPath Syntax Error"); + workspace->ctxt = xmlXPathNewContext(workspace->doctree); + workspace->ctxt->node = xmlDocGetRootElement(workspace->doctree); - /* Now evaluate the path expression. */ - workspace->res = xmlXPathCompiledEval(comppath, workspace->ctxt); + /* compile the path */ + comppath = xmlXPathCtxtCompile(workspace->ctxt, xpath); + if (comppath == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, + "XPath Syntax Error"); - xmlXPathFreeCompExpr(comppath); - } - } - PG_CATCH(); - { - cleanup_workspace(workspace); - - pg_xml_done(xmlerrcxt, true); + /* Now evaluate the path expression. */ + workspace->res = xmlXPathCompiledEval(comppath, workspace->ctxt); - PG_RE_THROW(); + xmlXPathFreeCompExpr(comppath); } - PG_END_TRY(); - if (workspace->res == NULL) - cleanup_workspace(workspace); - - pg_xml_done(xmlerrcxt, false); - - return workspace->res; + return workspace; } /* Clean up after processing the result of pgxml_xpath() */ @@ -438,35 +536,60 @@ pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar *septag, xmlChar *plainsep) { - xmlChar *xpresstr; - text *xpres; + xmlChar *volatile xpresstr = NULL; + text *volatile xpres = NULL; + PgXmlErrorContext *xmlerrcxt; if (res == NULL) return NULL; - switch (res->type) - { - case XPATH_NODESET: - xpresstr = pgxmlNodeSetToText(res->nodesetval, - toptag, - septag, plainsep); - break; + /* spin some error handling */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - case XPATH_STRING: - xpresstr = xmlStrdup(res->stringval); - break; + PG_TRY(); + { + switch (res->type) + { + case XPATH_NODESET: + xpresstr = pgxmlNodeSetToText(res->nodesetval, + toptag, + septag, plainsep); + break; + + case XPATH_STRING: + xpresstr = xmlStrdup(res->stringval); + if (xpresstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + break; + + default: + elog(NOTICE, "unsupported XQuery result: %d", res->type); + xpresstr = xmlStrdup((const xmlChar *) ""); + if (xpresstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); + } - default: - elog(NOTICE, "unsupported XQuery result: %d", res->type); - xpresstr = xmlStrdup((const xmlChar *) ""); + /* Now convert this result back to text */ + xpres = cstring_to_text((char *) xpresstr); } + PG_CATCH(); + { + if (xpresstr != NULL) + xmlFree(xpresstr); - /* Now convert this result back to text */ - xpres = cstring_to_text((char *) xpresstr); + pg_xml_done(xmlerrcxt, true); + + PG_RE_THROW(); + } + PG_END_TRY(); /* Free various storage */ xmlFree(xpresstr); + pg_xml_done(xmlerrcxt, false); + return xpres; } @@ -648,11 +771,16 @@ xpath_table(PG_FUNCTION_ARGS) for (j = 0; j < numpaths; j++) { ctxt = xmlXPathNewContext(doctree); + if (ctxt == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate XPath context"); + ctxt->node = xmlDocGetRootElement(doctree); /* compile the path */ comppath = xmlXPathCtxtCompile(ctxt, xpaths[j]); - if (comppath == NULL) + if (comppath == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "XPath Syntax Error"); @@ -671,6 +799,10 @@ xpath_table(PG_FUNCTION_ARGS) rownr < res->nodesetval->nodeNr) { resstr = xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); had_values = true; } else @@ -680,11 +812,19 @@ xpath_table(PG_FUNCTION_ARGS) case XPATH_STRING: resstr = xmlStrdup(res->stringval); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); resstr = xmlStrdup((const xmlChar *) ""); + if (resstr == NULL || pg_xml_error_occurred(xmlerrcxt)) + xml_ereport(xmlerrcxt, + ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate result"); } /* diff --git a/contrib/xml2/xslt_proc.c b/contrib/xml2/xslt_proc.c index b720d89f754ae..53550c7dc2406 100644 --- a/contrib/xml2/xslt_proc.c +++ b/contrib/xml2/xslt_proc.c @@ -48,7 +48,7 @@ xslt_process(PG_FUNCTION_ARGS) text *doct = PG_GETARG_TEXT_PP(0); text *ssheet = PG_GETARG_TEXT_PP(1); - text *result; + text *volatile result = NULL; text *paramstr; const char **params; PgXmlErrorContext *xmlerrcxt; @@ -58,8 +58,7 @@ xslt_process(PG_FUNCTION_ARGS) volatile xsltSecurityPrefsPtr xslt_sec_prefs = NULL; volatile xsltTransformContextPtr xslt_ctxt = NULL; volatile int resstat = -1; - xmlChar *resstr = NULL; - int reslen = 0; + xmlChar *volatile resstr = NULL; if (fcinfo->nargs == 3) { @@ -80,13 +79,14 @@ xslt_process(PG_FUNCTION_ARGS) { xmlDocPtr ssdoc; bool xslt_sec_prefs_error; + int reslen = 0; /* Parse document */ doctree = xmlReadMemory((char *) VARDATA_ANY(doct), VARSIZE_ANY_EXHDR(doct), NULL, NULL, XML_PARSE_NOENT); - if (doctree == NULL) + if (doctree == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "error parsing XML document"); @@ -95,14 +95,14 @@ xslt_process(PG_FUNCTION_ARGS) VARSIZE_ANY_EXHDR(ssheet), NULL, NULL, XML_PARSE_NOENT); - if (ssdoc == NULL) + if (ssdoc == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "error parsing stylesheet as XML document"); /* After this call we need not free ssdoc separately */ stylesheet = xsltParseStylesheetDoc(ssdoc); - if (stylesheet == NULL) + if (stylesheet == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "failed to parse stylesheet"); @@ -137,11 +137,15 @@ xslt_process(PG_FUNCTION_ARGS) restree = xsltApplyStylesheetUser(stylesheet, doctree, params, NULL, NULL, xslt_ctxt); - if (restree == NULL) + if (restree == NULL || pg_xml_error_occurred(xmlerrcxt)) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_ARGUMENT_FOR_XQUERY, "failed to apply stylesheet"); - resstat = xsltSaveResultToString(&resstr, &reslen, restree, stylesheet); + resstat = xsltSaveResultToString((xmlChar **) &resstr, &reslen, + restree, stylesheet); + + if (resstat >= 0) + result = cstring_to_text_with_len((char *) resstr, reslen); } PG_CATCH(); { @@ -155,6 +159,8 @@ xslt_process(PG_FUNCTION_ARGS) xsltFreeStylesheet(stylesheet); if (doctree != NULL) xmlFreeDoc(doctree); + if (resstr != NULL) + xmlFree(resstr); xsltCleanupGlobals(); pg_xml_done(xmlerrcxt, true); @@ -170,17 +176,15 @@ xslt_process(PG_FUNCTION_ARGS) xmlFreeDoc(doctree); xsltCleanupGlobals(); + if (resstr) + xmlFree(resstr); + pg_xml_done(xmlerrcxt, false); /* XXX this is pretty dubious, really ought to throw error instead */ if (resstat < 0) PG_RETURN_NULL(); - result = cstring_to_text_with_len((char *) resstr, reslen); - - if (resstr) - xmlFree(resstr); - PG_RETURN_TEXT_P(result); #else /* !USE_LIBXSLT */ diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 211a0ae1945bb..0aff0a6c8c6fc 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -278,8 +278,8 @@ SET client_min_messages = DEBUG1; TOAST table. - This option is known to be slow. Also, if the toast table or its - index is corrupt, checking it against toast values could conceivably + This option is known to be slow. Also, if the TOAST table or its + index is corrupt, checking it against TOAST values could conceivably crash the server, although in many cases this would just produce an error. diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 25b8904baf7cd..5f7489afbd165 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -991,7 +991,7 @@ SELECT pg_backup_start(label => 'label', fast => false); usually preferable as it minimizes the impact on the running system. If you want to start the backup as soon as possible, pass true as the second parameter to pg_backup_start and it will - request an immediate checkpoint, which will finish as fast as possible using + request a fast checkpoint, which will finish as fast as possible using as much I/O as possible. diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 3cd5bee7ffaf4..53a982bf60d29 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -1042,7 +1042,7 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat - Define indexes and toast tables. + Define indexes and TOAST tables. diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index fa86c569dc497..97f547b3cc4b2 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1951,7 +1951,7 @@ SCRAM-SHA-256$<iteration count>:&l The OID of the data type that corresponds to this table's row type, - if any; zero for indexes, sequences, and toast tables, which have + if any; zero for indexes, sequences, and TOAST tables, which have no pg_type entry @@ -3158,7 +3158,7 @@ SCRAM-SHA-256$<iteration count>:&l datcollate text - LC_COLLATE for this database + LC_COLLATE for this database (ignored unless datlocprovider is c) @@ -7971,7 +7971,7 @@ SCRAM-SHA-256$<iteration count>:&l Finish LSN of the transaction whose changes are to be skipped, if a valid - LSN; otherwise 0/0. + LSN; otherwise 0/0000000. @@ -8082,6 +8082,17 @@ SCRAM-SHA-256$<iteration count>:&l + + + subretaindeadtuples bool + + + If true, the information (e.g., dead tuples, commit timestamps, and + origins) on the subscriber that is useful for conflict detection is + retained. + + + subconninfo text diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 5a0e97f6f3158..59b27c3c370e2 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -100,7 +100,7 @@ initdb --locale=sv_SE LC_COLLATE - String sort order + String sort order (ignored unless the provider is libc) LC_CTYPE diff --git a/doc/src/sgml/client-auth.sgml b/doc/src/sgml/client-auth.sgml index 832b616a7bbff..51b95ed04f399 100644 --- a/doc/src/sgml/client-auth.sgml +++ b/doc/src/sgml/client-auth.sgml @@ -1003,8 +1003,9 @@ local db1,db2,@demodbs all md5 the remainder of the field is treated as a regular expression. (See for details of PostgreSQL's regular expression syntax.) The regular - expression can include a single capture, or parenthesized subexpression, - which can then be referenced in the database-username + expression can include a single capture, or parenthesized subexpression. + The portion of the system user name that matched the capture can then + be referenced in the database-username field as \1 (backslash-one). This allows the mapping of multiple user names in a single line, which is particularly useful for simple syntax substitutions. For example, these entries @@ -1022,12 +1023,11 @@ mymap /^(.*)@otherdomain\.com$ guest If the database-username field starts with a slash (/), the remainder of the field is treated - as a regular expression (see - for details of PostgreSQL's regular - expression syntax). It is not possible to use \1 - to use a capture from regular expression on - system-username for a regular expression - on database-username. + as a regular expression. + When the database-username field is a regular + expression, it is not possible to use \1 within it to + refer to a capture from the system-username + field. diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 59a0874528a3a..20ccb2d6b5447 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4618,10 +4618,12 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows - Invalidate replication slots that have remained idle longer than this - duration. If this value is specified without units, it is taken as - minutes. A value of zero (the default) disables the idle timeout - invalidation mechanism. This parameter can only be set in the + Invalidate replication slots that have remained inactive (not used by + a replication connection) + for longer than this duration. + If this value is specified without units, it is taken as seconds. + A value of zero (the default) disables the idle timeout + invalidation mechanism. This parameter can only be set in the postgresql.conf file or on the server command line. @@ -4963,6 +4965,8 @@ ANY num_sync ( + regdatabase + + regdictionary @@ -4878,6 +4882,13 @@ SELECT * FROM pg_attribute english + + regdatabase + pg_database + database name + template1 + + regdictionary pg_ts_dict @@ -5049,8 +5060,8 @@ WHERE ... be dropped without first removing the default expression. The alternative of nextval('my_seq'::text) does not create a dependency. - (regrole is an exception to this property. Constants of this - type are not allowed in stored expressions.) + (regdatabase and regrole are exceptions to this + property. Constants of these types are not allowed in stored expressions.) @@ -5110,7 +5121,7 @@ WHERE ... +(pg_lsn,numeric) and -(pg_lsn,numeric) operators, respectively. Note that the calculated LSN should be in the range of pg_lsn type, - i.e., between 0/0 and + i.e., between 0/00000000 and FFFFFFFF/FFFFFFFF. diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 224d4fe5a9f95..74a16af04ad3b 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3148,8 +3148,11 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in Converts the first letter of each word to upper case and the - rest to lower case. Words are sequences of alphanumeric - characters separated by non-alphanumeric characters. + rest to lower case. When using the libc locale + provider, words are sequences of alphanumeric characters separated + by non-alphanumeric characters; when using the ICU locale provider, + words are separated according to + Unicode Standard Annex #29. initcap('hi THOMAS') @@ -11247,10 +11250,10 @@ now() statement (more specifically, the time of receipt of the latest command message from the client). statement_timestamp() and transaction_timestamp() - return the same value during the first command of a transaction, but might - differ during subsequent commands. + return the same value during the first statement of a transaction, but might + differ during subsequent statements. clock_timestamp() returns the actual current time, and - therefore its value changes even within a single SQL command. + therefore its value changes even within a single SQL statement. timeofday() is a historical PostgreSQL function. Like clock_timestamp(), it returns the actual current time, @@ -26750,6 +26753,23 @@ SELECT currval(pg_get_serial_sequence('sometable', 'id')); + + + + to_regdatabase + + to_regdatabase ( text ) + regdatabase + + + Translates a textual database name to its OID. A similar result is + obtained by casting the string to type regdatabase (see + ); however, this function will return + NULL rather than throwing an error if the name is + not found. + + + @@ -27687,6 +27707,31 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} details. + + + + + pg_get_multixact_members + + pg_get_multixact_members ( multixid xid ) + setof record + ( xid xid, + mode text ) + + + Returns the transaction ID and lock mode for each member of the + specified multixact ID. The lock modes forupd, + fornokeyupd, sh, and + keysh correspond to the row-level locks + FOR UPDATE, FOR NO KEY UPDATE, + FOR SHARE, and FOR KEY SHARE, + respectively, as described in . Two + additional modes are specific to multixacts: + nokeyupd, used by updates that do not modify key + columns, and upd, used by updates or deletes that + modify key columns. + + @@ -27695,7 +27740,8 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} The internal transaction ID type xid is 32 bits wide and wraps around every 4 billion transactions. However, the functions shown in , except - age and mxid_age, use a + age, mxid_age, and + pg_get_multixact_members, use a 64-bit type xid8 that does not wrap around during the life of an installation and can be converted to xid by casting if required; see for details. @@ -28478,7 +28524,7 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} Returns information about the progress of the WAL summarizer. If the WAL summarizer has never run since the instance was started, then summarized_tli and summarized_lsn - will be 0 and 0/0 respectively; + will be 0 and 0/00000000 respectively; otherwise, they will be the TLI and ending LSN of the last WAL summary file written to disk. If the WAL summarizer is currently running, pending_lsn will be the ending LSN of the last @@ -28930,7 +28976,7 @@ LOG: Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560 will be stored.) If the optional second parameter is given as true, it specifies executing pg_backup_start as quickly - as possible. This forces an immediate checkpoint which will cause a + as possible. This forces a fast checkpoint which will cause a spike in I/O operations, slowing any concurrently executing queries. @@ -29549,7 +29595,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset Creates a new physical replication slot named - slot_name. The optional second parameter, + slot_name. The name cannot be + pg_conflict_detection as it is reserved for the + conflict detection slot. The optional second parameter, when true, specifies that the LSN for this replication slot be reserved immediately; otherwise the LSN is reserved on first connection from a streaming @@ -29593,7 +29641,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset Creates a new logical (decoding) replication slot named slot_name using the output plugin - plugin. The optional third + plugin. The name cannot be + pg_conflict_detection as it is reserved for + the conflict detection slot. The optional third parameter, temporary, when set to true, specifies that the slot should not be permanently stored to disk and is only meant for use by the current session. Temporary slots are also @@ -29623,6 +29673,8 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset Copies an existing physical replication slot named src_slot_name to a physical replication slot named dst_slot_name. + The new slot name cannot be pg_conflict_detection, + as it is reserved for the conflict detection. The copied physical slot starts to reserve WAL from the same LSN as the source slot. temporary is optional. If temporary @@ -29645,8 +29697,10 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset Copies an existing logical replication slot named src_slot_name to a logical replication slot named dst_slot_name, optionally changing - the output plugin and persistence. The copied logical slot starts - from the same LSN as the source logical slot. Both + the output plugin and persistence. The new slot name cannot be + pg_conflict_detection as it is reserved for + the conflict detection. The copied logical slot starts from the same + LSN as the source logical slot. Both temporary and plugin are optional; if they are omitted, the values of the source slot are used. The failover option of the source logical slot @@ -29981,7 +30035,7 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset logical decoding and must be dropped after promotion. See for details. Note that this function is primarily intended for testing and - debugging purposes and should be used with caution. Additionaly, + debugging purposes and should be used with caution. Additionally, this function cannot be executed if sync_replication_slots is enabled and the slotsync diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml index 46e87e01324dd..82410b1fbdfa1 100644 --- a/doc/src/sgml/gin.sgml +++ b/doc/src/sgml/gin.sgml @@ -394,7 +394,11 @@ Pointer extra_data) - Compare a partial-match query key to an index key. Returns an integer + Compare a partial-match query key to an index key. + partial_key is a query key that was returned + by extractQuery with an indication that it + requires partial match, and key is an index entry. + Returns an integer whose sign indicates the result: less than zero means the index key does not match the query, but the index scan should continue; zero means that the index key does match the query; greater than zero diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 1aa4741a8eaee..63d7e376f195e 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -147,7 +147,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index de19f3ad92952..8e5da767c48b2 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -65,7 +65,7 @@ - The minimum required version of Meson is 0.54. + The minimum required version of Meson is 0.57.2. @@ -3847,17 +3847,13 @@ make: *** [postgres] Error 1 Both 32-bit and 64-bit builds are possible with the Microsoft Compiler suite. 32-bit PostgreSQL builds are possible with - Visual Studio 2015 to + Visual Studio 2019 to Visual Studio 2022, as well as standalone Windows SDK releases 10 and above. 64-bit PostgreSQL builds are supported with Microsoft Windows SDK version 10 and above or - Visual Studio 2015 and above. + Visual Studio 2019 and above. Errors detected at semantic analysis or later, such as a misspelled table or column name, do not have this effect. + + + Lastly, note that all the statements within the Query message will + observe the same value of statement_timestamp(), + since that timestamp is updated only upon receipt of the Query + message. This will result in them all observing the same + value of transaction_timestamp() as well, + except in cases where the query string ends a previously-started + transaction and begins a new one. + @@ -2225,6 +2235,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" The name of the slot to create. Must be a valid replication slot name (see ). + The name cannot be pg_conflict_detection as it + is reserved for the conflict detection. @@ -2643,6 +2655,65 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + Primary status update (B) + + + + Byte1('s') + + + Identifies the message as a primary status update. + + + + + + Int64 + + + The latest WAL write position on the server. + + + + + + Int64 + + + The oldest transaction ID that is currently in the commit phase on + the server, along with its epoch. The most significant 32 bits are + the epoch. The least significant 32 bits are the transaction ID. + If no transactions are active on the server, this number will be + the next transaction ID to be assigned. + + + + + + Int64 + + + The next transaction ID to be assigned on the server, along with + its epoch. The most significant 32 bits are the epoch. The least + significant 32 bits are the transaction ID. + + + + + + Int64 + + + The server's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. + + + + + + @@ -2787,6 +2858,33 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + Request primary status update (F) + + + + Byte1('p') + + + Identifies the message as a request for a primary status update. + + + + + + Int64 + + + The client's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. + + + + + + + @@ -3482,6 +3580,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" Boolean option to use binary transfer mode. Binary mode is faster than the text mode but slightly less robust. + The default is off. @@ -3494,6 +3593,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" Boolean option to enable sending the messages that are written by pg_logical_emit_message. + The default is off. @@ -3504,11 +3604,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - Boolean option to enable streaming of in-progress transactions. - It accepts an additional value "parallel" to enable sending extra - information with some messages to be used for parallelisation. - Minimum protocol version 2 is required to turn it on. Minimum protocol - version 4 is required for the "parallel" option. + Option to enable streaming of in-progress transactions. Valid values are + off (the default), on and + parallel. The setting parallel + enables sending extra information with some messages to be used for + parallelization. Minimum protocol version 2 is required to turn it + on. Minimum protocol version 4 is required for the + parallel value. @@ -3521,6 +3623,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" Boolean option to enable two-phase transactions. Minimum protocol version 3 is required to turn it on. + The default is off. @@ -3537,6 +3640,7 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" to send the changes regardless of their origin. This can be used to avoid loops (infinite replication of the same data) among replication nodes. + The default is any. @@ -6081,13 +6185,14 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - Int32(196608) + Int32 The protocol version number. The most significant 16 bits are - the major version number (3 for the protocol described here). - The least significant 16 bits are the minor version number - (0 for the protocol described here). + the major version number. The least significant 16 bits are the minor + version number. As an example protocol version 3.2 is represented as + 196610 in decimal or more clearly as + 0x00030002 in hexadecimal. diff --git a/doc/src/sgml/query.sgml b/doc/src/sgml/query.sgml index 727a0cb185fb2..b190f28d41ea6 100644 --- a/doc/src/sgml/query.sgml +++ b/doc/src/sgml/query.sgml @@ -264,8 +264,18 @@ COPY weather FROM '/home/user/weather.txt'; where the file name for the source file must be available on the machine running the backend process, not the client, since the backend process - reads the file directly. You can read more about the - COPY command in . + reads the file directly. The data inserted above into the weather table + could also be inserted from a file containing (values are separated by a + tab character): + + +San Francisco 46 50 0.25 1994-11-27 +San Francisco 43 57 0.0 1994-11-29 +Hayward 37 54 \N 1994-11-29 + + + You can read more about the COPY command in + . diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index fdc648d007f1c..d48cdc76bd34d 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml @@ -235,8 +235,9 @@ ALTER SUBSCRIPTION name RENAME TO < password_required, run_as_owner, origin, - failover, and - two_phase. + failover, + two_phase, and + retain_dead_tuples. Only a superuser can set password_required = false. @@ -261,8 +262,9 @@ ALTER SUBSCRIPTION name RENAME TO < - The failover - and two_phase + The failover, + two_phase, and + retain_dead_tuples parameters can only be altered when the subscription is disabled. @@ -285,6 +287,14 @@ ALTER SUBSCRIPTION name RENAME TO < option is changed from true to false, the publisher will replicate the transactions again when they are committed. + + + If the retain_dead_tuples + option is altered to false and no other subscription + has this option enabled, the replication slot named + pg_conflict_detection, created to retain + dead tuples for conflict detection, will be dropped. + diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index d16969916835d..1e4f26c13f650 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -852,7 +852,7 @@ WITH ( MODULUS numeric_literal, REM SHARE UPDATE EXCLUSIVE lock will be taken for - fillfactor, toast and autovacuum storage parameters, as well as the + fillfactor, TOAST and autovacuum storage parameters, as well as the planner parameter parallel_workers. diff --git a/doc/src/sgml/ref/checkpoint.sgml b/doc/src/sgml/ref/checkpoint.sgml index db011a47d0458..cd981cf2cab9f 100644 --- a/doc/src/sgml/ref/checkpoint.sgml +++ b/doc/src/sgml/ref/checkpoint.sgml @@ -21,7 +21,12 @@ PostgreSQL documentation -CHECKPOINT +CHECKPOINT [ ( option [, ...] ) ] + +where option can be one of: + + FLUSH_UNLOGGED [ boolean ] + MODE { FAST | SPREAD } @@ -37,14 +42,24 @@ CHECKPOINT - The CHECKPOINT command forces an immediate + By default, the CHECKPOINT command forces a fast checkpoint when the command is issued, without waiting for a regular checkpoint scheduled by the system (controlled by the settings in ). + To request the checkpoint be spread over a longer interval, set the + MODE option to SPREAD. CHECKPOINT is not intended for use during normal operation. + + The server may consolidate concurrently requested checkpoints. Such + consolidated requests will contain a combined set of options. For example, + if one session requests a fast checkpoint and another requests a spread + checkpoint, the server may combine those requests and perform one fast + checkpoint. + + If executed during recovery, the CHECKPOINT command will force a restartpoint (see ) @@ -58,6 +73,55 @@ CHECKPOINT + + Parameters + + + + FLUSH_UNLOGGED + + + Normally, CHECKPOINT does not flush dirty buffers of + unlogged relations. This option, which is disabled by default, enables + flushing unlogged relations to disk. + + + + + + MODE + + + When set to FAST, which is the default, the requested + checkpoint will be completed as fast as possible, which may result in a + significantly higher rate of I/O during the checkpoint. + + + MODE can also be set to SPREAD to + request the checkpoint be spread over a longer interval (controlled via + the settings in ), like a + regular checkpoint scheduled by the system. This can reduce the rate of + I/O during the checkpoint. + + + + + + boolean + + + Specifies whether the selected option should be turned on or off. + You can write TRUE, ON, or + 1 to enable the option, and FALSE, + OFF, or 0 to disable it. The + boolean value can also + be omitted, in which case TRUE is assumed. + + + + + + Compatibility diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 8433344e5b6f5..c2d1fbc1fbe94 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -37,7 +37,7 @@ COPY { table_name [ ( delimiter_character' NULL 'null_string' DEFAULT 'default_string' - HEADER [ boolean | MATCH ] + HEADER [ boolean | integer | MATCH ] QUOTE 'quote_character' ESCAPE 'escape_character' FORCE_QUOTE { ( column_name [, ...] ) | * } @@ -212,6 +212,15 @@ COPY { table_name [ ( + + integer + + + Specifies a non-negative integer value passed to the selected option. + + + + FORMAT @@ -303,16 +312,25 @@ COPY { table_name [ ( HEADER - Specifies that the file contains a header line with the names of each - column in the file. On output, the first line contains the column - names from the table. On input, the first line is discarded when this - option is set to true (or equivalent Boolean value). - If this option is set to MATCH, the number and names - of the columns in the header line must match the actual column names of - the table, in order; otherwise an error is raised. + On output, if this option is set to true + (or an equivalent Boolean value), the first line of the output will + contain the column names from the table. + Integer values 0 and 1 are + accepted as Boolean values, but other integers are not allowed for + COPY TO commands. + + + On input, if this option is set to true + (or an equivalent Boolean value), the first line of the input is + discarded. If set to a non-negative integer, that number of + lines are discarded. If set to MATCH, the first line + is discarded, and it must contain column names that exactly match the + table's columns, in both number and order; otherwise, an error is raised. + The MATCH value is only valid for + COPY FROM commands. + + This option is not allowed when using binary format. - The MATCH option is only valid for COPY - FROM commands. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 4da8aeebb50a2..3544b15efdafa 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -150,12 +150,12 @@ CREATE DATABASE name Sets the default collation order and character classification in the new database. Collation affects the sort order applied to strings, - e.g., in queries with ORDER BY, as well as the order used in indexes - on text columns. Character classification affects the categorization - of characters, e.g., lower, upper, and digit. Also sets the - associated aspects of the operating system environment, - LC_COLLATE and LC_CTYPE. The - default is the same setting as the template database. See ORDER BY, as well as the + order used in indexes on text columns. Character classification + affects the categorization of characters, e.g., lower, upper, and + digit. Also sets the LC_CTYPE aspect of the + operating system environment. The default is the same setting as the + template database. See and for details. @@ -189,17 +189,16 @@ CREATE DATABASE name lc_collate - Sets LC_COLLATE in the database server's operating - system environment. The default is the setting of if specified, otherwise the same - setting as the template database. See below for additional - restrictions. + If is + libc, sets the default collation order to use in + the new database, overriding the setting . Otherwise, this setting is + ignored. - If is - libc, also sets the default collation order to use - in the new database, overriding the setting . + The default is the setting of + if specified, otherwise the same setting as the template database. + See below for additional restrictions. @@ -208,16 +207,18 @@ CREATE DATABASE name Sets LC_CTYPE in the database server's operating - system environment. The default is the setting of if specified, otherwise the same - setting as the template database. See below for additional - restrictions. + system environment. If is - libc, also sets the default character - classification to use in the new database, overriding the setting - . + libc, sets the default character classification to + use in the new database, overriding the setting . + + + The default is the setting of + if specified, otherwise the same setting as the template database. + See below for additional restrictions. diff --git a/doc/src/sgml/ref/create_operator.sgml b/doc/src/sgml/ref/create_operator.sgml index 3553d36454185..d2ffb1b2a500f 100644 --- a/doc/src/sgml/ref/create_operator.sgml +++ b/doc/src/sgml/ref/create_operator.sgml @@ -23,7 +23,7 @@ PostgreSQL documentation CREATE OPERATOR name ( {FUNCTION|PROCEDURE} = function_name - [, LEFTARG = left_type ] [, RIGHTARG = right_type ] + [, LEFTARG = left_type ] , RIGHTARG = right_type [, COMMUTATOR = com_op ] [, NEGATOR = neg_op ] [, RESTRICT = res_proc ] [, JOIN = join_proc ] [, HASHES ] [, MERGES ] @@ -88,8 +88,8 @@ CREATE OPERATOR name ( For binary operators, both LEFTARG and - RIGHTARG must be defined. For prefix operators only - RIGHTARG should be defined. + RIGHTARG must be defined. For prefix operators, only + RIGHTARG must be defined. The function_name function must have been previously defined using CREATE FUNCTION and must be defined to accept the correct number diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index 57dec28a5df64..b8cd15f32806b 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -169,7 +169,9 @@ CREATE SUBSCRIPTION subscription_name Name of the publisher's replication slot to use. The default is - to use the name of the subscription for the slot name. + to use the name of the subscription for the slot name. The name cannot + be pg_conflict_detection as it is reserved for the + conflict detection. @@ -435,6 +437,89 @@ CREATE SUBSCRIPTION subscription_name + + + retain_dead_tuples (boolean) + + + Specifies whether the information (e.g., dead tuples, commit + timestamps, and origins) required for conflict detection on the + subscriber is retained. The default is false. + If set to true, a physical replication slot named + pg_conflict_detection will be + created on the subscriber to prevent the conflict information from + being removed. + + + + Note that the information useful for conflict detection is retained + only after the creation of the slot. You can verify the existence of + this slot by querying pg_replication_slots. + And even if multiple subscriptions on one node enable this option, + only one replication slot will be created. Also, + wal_level must be set to replica + or higher to allow the replication slot to be used. + + + + + Note that the information for conflict detection cannot be purged if + the subscription is disabled; thus, the information will accumulate + until the subscription is enabled. To prevent excessive accumulation, + it is recommended to disable retain_dead_tuples + if the subscription will be inactive for an extended period. + + + + Additionally when enabling retain_dead_tuples for + conflict detection in logical replication, it is important to design the + replication topology to balance data retention requirements with + overall system performance. This option provides minimal performance + overhead when applied appropriately. The following scenarios illustrate + effective usage patterns when enabling this option. + + + + a. Large Tables with Bidirectional Writes: + For large tables subject to concurrent writes on both publisher and + subscriber nodes, publishers can define row filters when creating + publications to segment data. This allows multiple subscriptions + to replicate exclusive subsets of the table in parallel, optimizing + the throughput. + + + + b. Write-Enabled Subscribers: + If a subscriber node is expected to perform write operations, replication + can be structured using multiple publications and subscriptions. By + distributing tables across these publications, the workload is spread among + several apply workers, improving concurrency and reducing contention. + + + + c. Read-Only Subscribers: + In configurations involving single or multiple publisher nodes + performing concurrent write operations, read-only subscriber nodes may + replicate changes without seeing a performance impact if it does index + scan. However, if the subscriber is impacted due to replication lag or + scan performance (say due to sequential scans), it needs to follow one + of the two previous strategies to distribute the workload on the + subscriber. + + + + + This option cannot be enabled if the publisher is a physical standby. + + + + Enabling this option ensures retention of information useful for + conflict detection solely for changes occurring locally on the + publisher. For the changes originating from different origins, + reliable conflict detection cannot be guaranteed. + + + diff --git a/doc/src/sgml/ref/create_trigger.sgml b/doc/src/sgml/ref/create_trigger.sgml index 982ab6f3ee450..ed6d206ae7143 100644 --- a/doc/src/sgml/ref/create_trigger.sgml +++ b/doc/src/sgml/ref/create_trigger.sgml @@ -29,7 +29,7 @@ PostgreSQL documentation CREATE [ OR REPLACE ] [ CONSTRAINT ] TRIGGER name { BEFORE | AFTER | INSTEAD OF } { event [ OR ... ] } ON table_name [ FROM referenced_table_name ] - [ NOT DEFERRABLE | [ DEFERRABLE ] [ INITIALLY IMMEDIATE | INITIALLY DEFERRED ] ] + [ NOT DEFERRABLE | [ DEFERRABLE ] [ INITIALLY IMMEDIATE | INITIALLY DEFERRED ] ] [ ENFORCED ] [ REFERENCING { { OLD | NEW } TABLE [ AS ] transition_relation_name } [ ... ] ] [ FOR [ EACH ] { ROW | STATEMENT } ] [ WHEN ( condition ) ] @@ -321,6 +321,15 @@ UPDATE OF column_name1 [, column_name2 + + ENFORCED + + + This is a noise word. Constraint triggers are always enforced. + + + + REFERENCING diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index 5c4e0465ed9da..2ccbe13f39008 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -136,7 +136,8 @@ PostgreSQL documentation - Specifies the LC_COLLATE setting to be used in this database. + Specifies the LC_COLLATE setting to be used in this database (ignored + unless the locale provider is libc). diff --git a/doc/src/sgml/ref/pg_amcheck.sgml b/doc/src/sgml/ref/pg_amcheck.sgml index 6bfe28799c4e6..ef2bdfd19ae5d 100644 --- a/doc/src/sgml/ref/pg_amcheck.sgml +++ b/doc/src/sgml/ref/pg_amcheck.sgml @@ -41,7 +41,7 @@ PostgreSQL documentation - Only ordinary and toast table relations, materialized views, sequences, and + Only ordinary and TOAST table relations, materialized views, sequences, and btree indexes are currently supported. Other relation types are silently skipped. @@ -276,7 +276,7 @@ PostgreSQL documentation - By default, if a table is checked, its toast table, if any, will also + By default, if a table is checked, its TOAST table, if any, will also be checked, even if it is not explicitly selected by an option such as --table or --relation. This option suppresses that behavior. @@ -306,9 +306,9 @@ PostgreSQL documentation - By default, whenever a toast pointer is encountered in a table, + By default, whenever a TOAST pointer is encountered in a table, a lookup is performed to ensure that it references apparently-valid - entries in the toast table. These checks can be quite slow, and this + entries in the TOAST table. These checks can be quite slow, and this option can be used to skip them. @@ -368,9 +368,9 @@ PostgreSQL documentation End checking at the specified block number. An error will occur if the table relation being checked has fewer than this number of blocks. This option does not apply to indexes, and is probably only useful when - checking a single table relation. If both a regular table and a toast + checking a single table relation. If both a regular table and a TOAST table are checked, this option will apply to both, but higher-numbered - toast blocks may still be accessed while validating toast pointers, + TOAST blocks may still be accessed while validating TOAST pointers, unless that is suppressed using . diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index 9659f76042c5b..fecee08b0a536 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -500,8 +500,9 @@ PostgreSQL documentation - Sets checkpoint mode to fast (immediate) or spread (the default) + Sets checkpoint mode to fast or spread (see ). + The default is spread. diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml index 8ca68da5a5560..f4cbc8288e3ad 100644 --- a/doc/src/sgml/ref/pg_dumpall.sgml +++ b/doc/src/sgml/ref/pg_dumpall.sgml @@ -16,10 +16,7 @@ PostgreSQL documentation pg_dumpall - - - export a PostgreSQL database cluster as an SQL script or to other formats - + extract a PostgreSQL database cluster into a script file @@ -36,7 +33,7 @@ PostgreSQL documentation pg_dumpall is a utility for writing out (dumping) all PostgreSQL databases - of a cluster into an SQL script file or an archive. The output contains + of a cluster into one script file. The script file contains SQL commands that can be used as input to to restore the databases. It does this by calling for each database in the cluster. @@ -55,16 +52,11 @@ PostgreSQL documentation - Plain text SQL scripts will be written to the standard output. Use the + The SQL script will be written to the standard output. Use the / option or shell operators to redirect it into a file. - - Archives in other formats will be placed in a directory named using the - /, which is required in this case. - - pg_dumpall needs to connect several times to the PostgreSQL server (once per @@ -129,85 +121,10 @@ PostgreSQL documentation Send output to the specified file. If this is omitted, the standard output is used. - Note: This option can only be omitted when is plain - - - - - - Specify the format of dump files. In plain format, all the dump data is - sent in a single text stream. This is the default. - - In all other modes, pg_dumpall first creates two files: - global.dat and map.dat, in the directory - specified by . - The first file contains global data, such as roles and tablespaces. The second - contains a mapping between database oids and names. These files are used by - pg_restore. Data for individual databases is placed in - databases subdirectory, named using the database's oid. - - - - d - directory - - - Output directory-format archives for each database, - suitable for input into pg_restore. The directory - will have database oid as its name. - - - - - - p - plain - - - Output a plain-text SQL script file (the default). - - - - - - c - custom - - - Output a custom-format archive for each database, - suitable for input into pg_restore. The archive - will be named dboid.dmp where dboid is the - oid of the database. - - - - - - t - tar - - - Output a tar-format archive for each database, - suitable for input into pg_restore. The archive - will be named dboid.tar where dboid is the - oid of the database. - - - - - - - Note: see for details - of how the various non plain text archives work. - - - - - diff --git a/doc/src/sgml/ref/pg_recvlogical.sgml b/doc/src/sgml/ref/pg_recvlogical.sgml index f68182266a9fa..263ebdeeab4a8 100644 --- a/doc/src/sgml/ref/pg_recvlogical.sgml +++ b/doc/src/sgml/ref/pg_recvlogical.sgml @@ -53,6 +53,16 @@ PostgreSQL documentation (ControlC) or SIGTERM signal. + + + When pg_recvlogical receives + a SIGHUP signal, it closes the current output file + and opens a new one using the filename specified by + the option. This allows us to rotate + the output file by first renaming the current file and then sending + a SIGHUP signal to + pg_recvlogical. + diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index b649bd3a5ae0f..2abe05d47e936 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -18,9 +18,8 @@ PostgreSQL documentation pg_restore - restore PostgreSQL databases from archives - created by pg_dump or - pg_dumpall + restore a PostgreSQL database from an + archive file created by pg_dump @@ -39,14 +38,13 @@ PostgreSQL documentation pg_restore is a utility for restoring a - PostgreSQL database or cluster from an archive - created by or - in one of the non-plain-text + PostgreSQL database from an archive + created by in one of the non-plain-text formats. It will issue the commands necessary to reconstruct the - database or cluster to the state it was in at the time it was saved. The - archives also allow pg_restore to + database to the state it was in at the time it was saved. The + archive files also allow pg_restore to be selective about what is restored, or even to reorder the items - prior to being restored. The archive formats are designed to be + prior to being restored. The archive files are designed to be portable across architectures. @@ -54,17 +52,10 @@ PostgreSQL documentation pg_restore can operate in two modes. If a database name is specified, pg_restore connects to that database and restores archive contents directly into - the database. - When restoring from a dump made by pg_dumpall, - each database will be created and then the restoration will be run in that - database. - - Otherwise, when a database name is not specified, a script containing the SQL - commands necessary to rebuild the database or cluster is created and written + the database. Otherwise, a script containing the SQL + commands necessary to rebuild the database is created and written to a file or standard output. This script output is equivalent to - the plain text output format of pg_dump or - pg_dumpall. - + the plain text output format of pg_dump. Some of the options controlling the output are therefore analogous to pg_dump options. @@ -149,8 +140,6 @@ PostgreSQL documentation commands that mention this database. Access privileges for the database itself are also restored, unless is specified. - is required when restoring multiple databases - from an archive created by pg_dumpall. @@ -246,19 +235,6 @@ PostgreSQL documentation - - - - - - Restore only global objects (roles and tablespaces), no databases. - - - This option is only relevant when restoring from an archive made using pg_dumpall. - - - - @@ -603,28 +579,6 @@ PostgreSQL documentation - - - - - Do not restore databases whose name matches - pattern. - Multiple patterns can be excluded by writing multiple - switches. The - pattern parameter is - interpreted as a pattern according to the same rules used by - psql's \d - commands (see ), - so multiple databases can also be excluded by writing wildcard - characters in the pattern. When using wildcards, be careful to - quote the pattern if needed to prevent shell wildcard expansion. - - - This option is only relevant when restoring from an archive made using pg_dumpall. - - - - diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml index a5eb3aa25e02f..afe6a12be4b30 100644 --- a/doc/src/sgml/ref/pgtesttiming.sgml +++ b/doc/src/sgml/ref/pgtesttiming.sgml @@ -30,11 +30,23 @@ PostgreSQL documentation Description - pg_test_timing is a tool to measure the timing overhead - on your system and confirm that the system time never moves backwards. + pg_test_timing is a tool to measure the + timing overhead on your system and confirm that the system time never + moves backwards. It simply reads the system clock over and over again + as fast as it can for a specified length of time, and then prints + statistics about the observed differences in successive clock readings. + + + Smaller (but not zero) differences are better, since they imply both + more-precise clock hardware and less overhead to collect a clock reading. Systems that are slow to collect timing data can give less accurate EXPLAIN ANALYZE results. + + This tool is also helpful to determine if + the track_io_timing configuration parameter is likely + to produce useful results. + @@ -59,6 +71,21 @@ PostgreSQL documentation + + + + + + Specifies the cutoff percentage for the list of exact observed + timing durations (that is, the changes in the system clock value + from one reading to the next). The list will end once the running + percentage total reaches or exceeds this value, except that the + largest observed duration will always be printed. The default + cutoff is 99.99. + + + + @@ -92,205 +119,83 @@ PostgreSQL documentation Interpreting Results - Good results will show most (>90%) individual timing calls take less than - one microsecond. Average per loop overhead will be even lower, below 100 - nanoseconds. This example from an Intel i7-860 system using a TSC clock - source shows excellent performance: - - + The first block of output has four columns, with rows showing a + shifted-by-one log2(ns) histogram of timing durations (that is, the + differences between successive clock readings). This is not the + classic log2(n+1) histogram as it counts zeros separately and then + switches to log2(ns) starting from value 1. - - Note that different units are used for the per loop time than the - histogram. The loop can have resolution within a few nanoseconds (ns), - while the individual timing calls can only resolve down to one microsecond - (us). + The columns are: + + + nanosecond value that is >= the durations in this + bucket + + + percentage of durations in this bucket + + + running-sum percentage of durations in this and previous + buckets + + + count of durations in this bucket + + - - - - Measuring Executor Timing Overhead - - When the query executor is running a statement using - EXPLAIN ANALYZE, individual operations are timed as well - as showing a summary. The overhead of your system can be checked by - counting rows with the psql program: - - -CREATE TABLE t AS SELECT * FROM generate_series(1,100000); -\timing -SELECT COUNT(*) FROM t; -EXPLAIN ANALYZE SELECT COUNT(*) FROM t; - + The second block of output goes into more detail, showing the exact + timing differences observed. For brevity this list is cut off when the + running-sum percentage exceeds the user-selectable cutoff value. + However, the largest observed difference is always shown. - - The i7-860 system measured runs the count query in 9.8 ms while - the EXPLAIN ANALYZE version takes 16.6 ms, each - processing just over 100,000 rows. That 6.8 ms difference means the timing - overhead per row is 68 ns, about twice what pg_test_timing estimated it - would be. Even that relatively small amount of overhead is making the fully - timed count statement take almost 70% longer. On more substantial queries, - the timing overhead would be less problematic. + The example results below show that 99.99% of timing loops took between + 8 and 31 nanoseconds, with the worst case somewhere between 32768 and + 65535 nanoseconds. In the second block, we can see that typical loop + time is 16 nanoseconds, and the readings appear to have full nanosecond + precision. - - - - Changing Time Sources - On some newer Linux systems, it's possible to change the clock source used - to collect timing data at any time. A second example shows the slowdown - possible from switching to the slower acpi_pm time source, on the same - system used for the fast results above: - /sys/devices/system/clocksource/clocksource0/current_clocksource -# pg_test_timing -Per loop time including overhead: 722.92 ns +Testing timing overhead for 3 seconds. +Average loop time including overhead: 16.40 ns Histogram of timing durations: - < us % of total count - 1 27.84870 1155682 - 2 72.05956 2990371 - 4 0.07810 3241 - 8 0.01357 563 - 16 0.00007 3 + <= ns % of total running % count + 0 0.0000 0.0000 0 + 1 0.0000 0.0000 0 + 3 0.0000 0.0000 0 + 7 0.0000 0.0000 0 + 15 4.5452 4.5452 8313178 + 31 95.4527 99.9979 174581501 + 63 0.0001 99.9981 253 + 127 0.0001 99.9982 165 + 255 0.0000 99.9982 35 + 511 0.0000 99.9982 1 + 1023 0.0013 99.9994 2300 + 2047 0.0004 99.9998 690 + 4095 0.0000 99.9998 9 + 8191 0.0000 99.9998 8 + 16383 0.0002 100.0000 337 + 32767 0.0000 100.0000 2 + 65535 0.0000 100.0000 1 + +Observed timing durations up to 99.9900%: + ns % of total running % count + 15 4.5452 4.5452 8313178 + 16 58.3785 62.9237 106773354 + 17 33.6840 96.6078 61607584 + 18 3.1151 99.7229 5697480 + 19 0.2638 99.9867 482570 + 20 0.0093 99.9960 17054 +... + 38051 0.0000 100.0000 1 ]]> - - In this configuration, the sample EXPLAIN ANALYZE above - takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple - of what's measured directly by this utility. That much timing overhead - means the actual query itself is only taking a tiny fraction of the - accounted for time, most of it is being consumed in overhead instead. In - this configuration, any EXPLAIN ANALYZE totals involving - many timed operations would be inflated significantly by timing overhead. - - - - FreeBSD also allows changing the time source on the fly, and it logs - information about the timer selected during boot: - - -# dmesg | grep "Timecounter" -Timecounter "ACPI-fast" frequency 3579545 Hz quality 900 -Timecounter "i8254" frequency 1193182 Hz quality 0 -Timecounters tick every 10.000 msec -Timecounter "TSC" frequency 2531787134 Hz quality 800 -# sysctl kern.timecounter.hardware=TSC -kern.timecounter.hardware: ACPI-fast -> TSC - - - - - Other systems may only allow setting the time source on boot. On older - Linux systems the "clock" kernel setting is the only way to make this sort - of change. And even on some more recent ones, the only option you'll see - for a clock source is "jiffies". Jiffies are the older Linux software clock - implementation, which can have good resolution when it's backed by fast - enough timing hardware, as in this example: - - - - - - Clock Hardware and Timing Accuracy - - - Collecting accurate timing information is normally done on computers using - hardware clocks with various levels of accuracy. With some hardware the - operating systems can pass the system clock time almost directly to - programs. A system clock can also be derived from a chip that simply - provides timing interrupts, periodic ticks at some known time interval. In - either case, operating system kernels provide a clock source that hides - these details. But the accuracy of that clock source and how quickly it can - return results varies based on the underlying hardware. - - - - Inaccurate time keeping can result in system instability. Test any change - to the clock source very carefully. Operating system defaults are sometimes - made to favor reliability over best accuracy. And if you are using a virtual - machine, look into the recommended time sources compatible with it. Virtual - hardware faces additional difficulties when emulating timers, and there are - often per operating system settings suggested by vendors. - - - - The Time Stamp Counter (TSC) clock source is the most accurate one available - on current generation CPUs. It's the preferred way to track the system time - when it's supported by the operating system and the TSC clock is - reliable. There are several ways that TSC can fail to provide an accurate - timing source, making it unreliable. Older systems can have a TSC clock that - varies based on the CPU temperature, making it unusable for timing. Trying - to use TSC on some older multicore CPUs can give a reported time that's - inconsistent among multiple cores. This can result in the time going - backwards, a problem this program checks for. And even the newest systems - can fail to provide accurate TSC timing with very aggressive power saving - configurations. - - - - Newer operating systems may check for the known TSC problems and switch to a - slower, more stable clock source when they are seen. If your system - supports TSC time but doesn't default to that, it may be disabled for a good - reason. And some operating systems may not detect all the possible problems - correctly, or will allow using TSC even in situations where it's known to be - inaccurate. - - - - The High Precision Event Timer (HPET) is the preferred timer on systems - where it's available and TSC is not accurate. The timer chip itself is - programmable to allow up to 100 nanosecond resolution, but you may not see - that much accuracy in your system clock. - - - - Advanced Configuration and Power Interface (ACPI) provides a Power - Management (PM) Timer, which Linux refers to as the acpi_pm. The clock - derived from acpi_pm will at best provide 300 nanosecond resolution. - - - - Timers used on older PC hardware include the 8254 Programmable Interval - Timer (PIT), the real-time clock (RTC), the Advanced Programmable Interrupt - Controller (APIC) timer, and the Cyclone timer. These timers aim for - millisecond resolution. - - @@ -298,6 +203,8 @@ Histogram of timing durations: + Wiki + discussion about timing diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index aeeed297437e6..5ddf3a8ae9257 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -1110,7 +1110,8 @@ psql --username=postgres --file=script.sql postgres regproc regprocedure - (regclass, regrole, and regtype can be upgraded.) + (regclass, regdatabase, regrole, and + regtype can be upgraded.) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index 95f4cac2467e3..4f7b11175c671 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -4623,6 +4623,15 @@ bar + + SERVICEFILE + + + The service file name, if applicable. + + + + SHELL_ERROR diff --git a/doc/src/sgml/ref/vacuumdb.sgml b/doc/src/sgml/ref/vacuumdb.sgml index b0680a61814cc..c7d9dca17b867 100644 --- a/doc/src/sgml/ref/vacuumdb.sgml +++ b/doc/src/sgml/ref/vacuumdb.sgml @@ -282,9 +282,11 @@ PostgreSQL documentation Only analyze relations that are missing statistics for a column, index - expression, or extended statistics object. This option prevents - vacuumdb from deleting existing statistics - so that the query optimizer's choices do not become transiently worse. + expression, or extended statistics object. When used with + , this option prevents + vacuumdb from temporarily replacing existing + statistics with ones generated with lower statistics targets, thus + avoiding transiently worse query optimizer choices. This option can only be used in conjunction with diff --git a/doc/src/sgml/sepgsql.sgml b/doc/src/sgml/sepgsql.sgml index 03ed7d1c90d15..0708e48bcd950 100644 --- a/doc/src/sgml/sepgsql.sgml +++ b/doc/src/sgml/sepgsql.sgml @@ -442,7 +442,7 @@ UPDATE t1 SET x = 2, y = func1(y) WHERE z = 100; The default database privilege system allows database superusers to modify system catalogs using DML commands, and reference or modify - toast tables. These operations are prohibited when + TOAST tables. These operations are prohibited when sepgsql is enabled. diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 986ae1f543dbd..4187191ea7413 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -81,6 +81,11 @@ open cursors + + pg_dsm_registry_allocations + shared memory allocations tracked in the DSM registry + + pg_file_settings summary of configuration file contents @@ -1086,6 +1091,75 @@ AND c1.path[c2.level] = c2.path[c2.level]; + + <structname>pg_dsm_registry_allocations</structname> + + + pg_dsm_registry_allocations + + + + The pg_dsm_registry_allocations view shows shared + memory allocations tracked in the dynamic shared memory (DSM) registry. + This includes memory allocated by extensions using the mechanisms detailed + in . + + + + <structname>pg_dsm_registry_allocations</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the allocation in the DSM registry. + + + + + + type text + + + The type of allocation. Possible values are segment, + area, and hash, which correspond + to dynamic shared memory segments, areas, and hash tables, respectively. + + + + + + size int8 + + + Size of the allocation in bytes. NULL for entries of type + area and hash. + + + + +
+ + + By default, the pg_dsm_registry_allocations view + can be read only by superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_file_settings</structname> @@ -2819,21 +2893,18 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx unreserved means that the slot no longer retains the required WAL files and some of them are to be removed at - the next checkpoint. This state can return + the next checkpoint. This typically occurs when + is set to + a non-negative value. This state can return to reserved or extended.
- lost means that some required WAL files have - been removed and this slot is no longer usable. + lost means that this slot is no longer usable. - The last two states are seen only when - is - non-negative. If restart_lsn is NULL, this - field is null.
@@ -2932,7 +3003,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx idle_timeout means that the slot has remained - idle longer than the configured + inactive longer than the configured duration. diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml index 5d1ae8f4f52e2..7d3d590471a32 100644 --- a/doc/src/sgml/test-decoding.sgml +++ b/doc/src/sgml/test-decoding.sgml @@ -25,16 +25,16 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'include-xids', '0'); - lsn | xid | data ------------+-----+-------------------------------------------------- - 0/16D30F8 | 691 | BEGIN - 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg' - 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo' - 0/16D32A0 | 691 | COMMIT - 0/16D32D8 | 692 | BEGIN - 0/16D3398 | 692 | table public.data: DELETE: id[int4]:2 - 0/16D3398 | 692 | table public.data: DELETE: id[int4]:3 - 0/16D3398 | 692 | COMMIT + lsn | xid | data +------------+-----+-------------------------------------------------- + 0/016D30F8 | 691 | BEGIN + 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg' + 0/016D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo' + 0/016D32A0 | 691 | COMMIT + 0/016D32D8 | 692 | BEGIN + 0/016D3398 | 692 | table public.data: DELETE: id[int4]:2 + 0/016D3398 | 692 | table public.data: DELETE: id[int4]:3 + 0/016D3398 | 692 | COMMIT (8 rows) @@ -45,18 +45,18 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i postgres[33712]=#* SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'stream-changes', '1'); - lsn | xid | data ------------+-----+-------------------------------------------------- - 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 - 0/16B21F8 | 503 | streaming change for TXN 503 - 0/16B2300 | 503 | streaming change for TXN 503 - 0/16B2408 | 503 | streaming change for TXN 503 - 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 - 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 - 0/16BECA8 | 503 | streaming change for TXN 503 - 0/16BEDB0 | 503 | streaming change for TXN 503 - 0/16BEEB8 | 503 | streaming change for TXN 503 - 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 + lsn | xid | data +------------+-----+-------------------------------------------------- + 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/016B21F8 | 503 | streaming change for TXN 503 + 0/016B2300 | 503 | streaming change for TXN 503 + 0/016B2408 | 503 | streaming change for TXN 503 + 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503 + 0/016B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/016BECA8 | 503 | streaming change for TXN 503 + 0/016BEDB0 | 503 | streaming change for TXN 503 + 0/016BEEB8 | 503 | streaming change for TXN 503 + 0/016BEBA0 | 503 | closing a streamed block for transaction TXN 503 (10 rows) diff --git a/doc/src/sgml/xoper.sgml b/doc/src/sgml/xoper.sgml index 954a90d77d0ed..853b07a9f1489 100644 --- a/doc/src/sgml/xoper.sgml +++ b/doc/src/sgml/xoper.sgml @@ -21,7 +21,7 @@ PostgreSQL supports prefix - and infix operators. Operators can be + and binary (or infix) operators. Operators can be overloaded;overloadingoperators that is, the same operator name can be used for different operators that have different numbers and types of operands. When a query is diff --git a/meson.build b/meson.build index 36e168a1a2ace..ca423dc8e12f3 100644 --- a/meson.build +++ b/meson.build @@ -11,10 +11,11 @@ project('postgresql', version: '19devel', license: 'PostgreSQL', - # We want < 0.56 for python 3.5 compatibility on old platforms. EPEL for - # RHEL 7 has 0.55. < 0.54 would require replacing some uses of the fs - # module, < 0.53 all uses of fs. So far there's no need to go to >=0.56. - meson_version: '>=0.54', + # We want < 0.62 for python 3.6 compatibility on old platforms. + # RHEL 8 has 0.58. < 0.57 would require various additional + # backward-compatibility conditionals. + # Meson 0.57.0 and 0.57.1 are buggy, therefore >=0.57.2. + meson_version: '>=0.57.2', default_options: [ 'warning_level=1', #-Wall equivalent 'b_pch=false', @@ -279,6 +280,10 @@ elif host_system == 'windows' # define before including for getting localtime_r() etc. on MinGW cppflags += '-D_POSIX_C_SOURCE' endif + if cc.get_id() == 'msvc' + # required for VA_ARGS_NARGS() in c.h; requires VS 2019 + cppflags += '/Zc:preprocessor' + endif export_file_format = 'win' export_file_suffix = 'def' @@ -943,10 +948,10 @@ if not libcurlopt.disabled() # libcurl and one of either epoll or kqueue. oauth_flow_supported = ( libcurl.found() - and (cc.check_header('sys/event.h', required: false, - args: test_c_args, include_directories: postgres_inc) - or cc.check_header('sys/epoll.h', required: false, - args: test_c_args, include_directories: postgres_inc)) + and (cc.has_header('sys/event.h', + args: test_c_args, include_directories: postgres_inc) + or cc.has_header('sys/epoll.h', + args: test_c_args, include_directories: postgres_inc)) ) if oauth_flow_supported @@ -990,6 +995,12 @@ liburingopt = get_option('liburing') liburing = dependency('liburing', required: liburingopt) if liburing.found() cdata.set('USE_LIBURING', 1) + + if cc.has_function('io_uring_queue_init_mem', + dependencies: liburing, args: test_c_args) + cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1) + endif + endif @@ -1284,7 +1295,7 @@ pyopt = get_option('plpython') python3_dep = not_found_dep if not pyopt.disabled() pm = import('python') - python3_inst = pm.find_installation(python.path(), required: pyopt) + python3_inst = pm.find_installation(python.full_path(), required: pyopt) if python3_inst.found() python3_dep = python3_inst.dependency(embed: true, required: pyopt) # Remove this check after we depend on Meson >= 1.1.0 @@ -1985,10 +1996,7 @@ if cc.links(''' cdata.set('HAVE__BUILTIN_OP_OVERFLOW', 1) endif - -# XXX: The configure.ac check for __cpuid() is broken, we don't copy that -# here. To prevent problems due to two detection methods working, stop -# checking after one. +# Check for __get_cpuid() and __cpuid(). if cc.links(''' #include int main(int arg, char **argv) @@ -3146,13 +3154,13 @@ gen_kwlist_cmd = [ ### if host_system == 'windows' - pg_ico = meson.source_root() / 'src' / 'port' / 'win32.ico' + pg_ico = meson.project_source_root() / 'src' / 'port' / 'win32.ico' win32ver_rc = files('src/port/win32ver.rc') rcgen = find_program('src/tools/rcgen', native: true) rcgen_base_args = [ '--srcdir', '@SOURCE_DIR@', - '--builddir', meson.build_root(), + '--builddir', meson.project_build_root(), '--rcout', '@OUTPUT0@', '--out', '@OUTPUT1@', '--input', '@INPUT@', @@ -3161,11 +3169,11 @@ if host_system == 'windows' if cc.get_argument_syntax() == 'msvc' rc = find_program('rc', required: true) - rcgen_base_args += ['--rc', rc.path()] + rcgen_base_args += ['--rc', rc.full_path()] rcgen_outputs = ['@BASENAME@.rc', '@BASENAME@.res'] else windres = find_program('windres', required: true) - rcgen_base_args += ['--windres', windres.path()] + rcgen_base_args += ['--windres', windres.full_path()] rcgen_outputs = ['@BASENAME@.rc', '@BASENAME@.obj'] endif @@ -3398,7 +3406,7 @@ foreach t1 : configure_files potentially_conflicting_files += meson.current_build_dir() / t endforeach foreach sub, fnames : generated_sources_ac - sub = meson.build_root() / sub + sub = meson.project_build_root() / sub foreach fname : fnames potentially_conflicting_files += sub / fname endforeach @@ -3498,7 +3506,7 @@ run_target('install-test-files', ############################################################### # DESTDIR for the installation we'll run tests in -test_install_destdir = meson.build_root() / 'tmp_install/' +test_install_destdir = meson.project_build_root() / 'tmp_install/' # DESTDIR + prefix appropriately munged if build_system != 'windows' @@ -3541,7 +3549,7 @@ test('install_test_files', is_parallel: false, suite: ['setup']) -test_result_dir = meson.build_root() / 'testrun' +test_result_dir = meson.project_build_root() / 'testrun' # XXX: pg_regress doesn't assign unique ports on windows. To avoid the @@ -3552,12 +3560,12 @@ testport = 40000 test_env = environment() -test_initdb_template = meson.build_root() / 'tmp_install' / 'initdb-template' +test_initdb_template = meson.project_build_root() / 'tmp_install' / 'initdb-template' test_env.set('PG_REGRESS', pg_regress.full_path()) test_env.set('REGRESS_SHLIB', regress_module.full_path()) test_env.set('INITDB_TEMPLATE', test_initdb_template) # for Cluster.pm's portlock logic -test_env.set('top_builddir', meson.build_root()) +test_env.set('top_builddir', meson.project_build_root()) # Add the temporary installation to the library search path on platforms where # that works (everything but windows, basically). On windows everything @@ -3601,26 +3609,20 @@ sys.exit(sp.returncode) # Test Generation ############################################################### -# When using a meson version understanding exclude_suites, define a -# 'tmp_install' test setup (the default) that excludes tests running against a -# pre-existing install and a 'running' setup that conflicts with creation of -# the temporary installation and tap tests (which don't support running -# against a running server). +# Define a 'tmp_install' test setup (the default) that excludes tests +# running against a pre-existing install and a 'running' setup that +# conflicts with creation of the temporary installation and tap tests +# (which don't support running against a running server). running_suites = [] install_suites = [] -if meson.version().version_compare('>=0.57') - runningcheck = true -else - runningcheck = false -endif testwrap = files('src/tools/testwrap') foreach test_dir : tests testwrap_base = [ testwrap, - '--basedir', meson.build_root(), + '--basedir', meson.project_build_root(), '--srcdir', test_dir['sd'], # Some test suites are not run by default but can be run if selected by the # user via variable PG_TEST_EXTRA. Pass configuration time value of @@ -3710,7 +3712,7 @@ foreach test_dir : tests install_suites += test_group # some tests can't support running against running DB - if runningcheck and t.get('runningcheck', true) + if t.get('runningcheck', true) test(test_group_running / kind, python, args: [ @@ -3737,8 +3739,8 @@ foreach test_dir : tests endif test_command = [ - perl.path(), - '-I', meson.source_root() / 'src/test/perl', + perl.full_path(), + '-I', meson.project_source_root() / 'src/test/perl', '-I', test_dir['sd'], ] @@ -3793,13 +3795,11 @@ foreach test_dir : tests endforeach # directories with tests # repeat condition so meson realizes version dependency -if meson.version().version_compare('>=0.57') - add_test_setup('tmp_install', - is_default: true, - exclude_suites: running_suites) - add_test_setup('running', - exclude_suites: ['setup'] + install_suites) -endif +add_test_setup('tmp_install', + is_default: true, + exclude_suites: running_suites) +add_test_setup('running', + exclude_suites: ['setup'] + install_suites) @@ -3856,7 +3856,7 @@ tar_gz = custom_target('tar.gz', '--format', 'tar.gz', '-9', '--prefix', distdir + '/', - '-o', join_paths(meson.build_root(), '@OUTPUT@'), + '-o', join_paths(meson.project_build_root(), '@OUTPUT@'), pg_git_revision], output: distdir + '.tar.gz', ) @@ -3866,11 +3866,11 @@ if bzip2.found() build_always_stale: true, command: [git, '-C', '@SOURCE_ROOT@', '-c', 'core.autocrlf=false', - '-c', 'tar.tar.bz2.command="@0@" -c'.format(bzip2.path()), + '-c', 'tar.tar.bz2.command="@0@" -c'.format(bzip2.full_path()), 'archive', '--format', 'tar.bz2', '--prefix', distdir + '/', - '-o', join_paths(meson.build_root(), '@OUTPUT@'), + '-o', join_paths(meson.project_build_root(), '@OUTPUT@'), pg_git_revision], output: distdir + '.tar.bz2', ) @@ -3887,10 +3887,7 @@ alias_target('pgdist', [tar_gz, tar_bz2]) # But not if we are in a subproject, in case the parent project wants to # create a dist using the standard Meson command. if not meson.is_subproject() - # We can only pass the identifier perl here when we depend on >= 0.55 - if meson.version().version_compare('>=0.55') - meson.add_dist_script(perl, '-e', 'exit 1') - endif + meson.add_dist_script(perl, '-e', 'exit 1') endif @@ -3899,106 +3896,102 @@ endif # The End, The End, My Friend ############################################################### -if meson.version().version_compare('>=0.57') +summary( + { + 'data block size': '@0@ kB'.format(cdata.get('BLCKSZ') / 1024), + 'WAL block size': '@0@ kB'.format(cdata.get('XLOG_BLCKSZ') / 1024), + 'segment size': get_option('segsize_blocks') != 0 ? + '@0@ blocks'.format(cdata.get('RELSEG_SIZE')) : + '@0@ GB'.format(get_option('segsize')), + }, + section: 'Data layout', +) - summary( - { - 'data block size': '@0@ kB'.format(cdata.get('BLCKSZ') / 1024), - 'WAL block size': '@0@ kB'.format(cdata.get('XLOG_BLCKSZ') / 1024), - 'segment size': get_option('segsize_blocks') != 0 ? - '@0@ blocks'.format(cdata.get('RELSEG_SIZE')) : - '@0@ GB'.format(get_option('segsize')), - }, - section: 'Data layout', - ) +summary( + { + 'host system': '@0@ @1@'.format(host_system, host_cpu), + 'build system': '@0@ @1@'.format(build_machine.system(), + build_machine.cpu_family()), + }, + section: 'System', +) - summary( - { - 'host system': '@0@ @1@'.format(host_system, host_cpu), - 'build system': '@0@ @1@'.format(build_machine.system(), - build_machine.cpu_family()), - }, - section: 'System', - ) +summary( + { + 'linker': '@0@'.format(cc.get_linker_id()), + 'C compiler': '@0@ @1@'.format(cc.get_id(), cc.version()), + }, + section: 'Compiler', +) +summary( + { + 'CPP FLAGS': ' '.join(cppflags), + 'C FLAGS, functional': ' '.join(cflags), + 'C FLAGS, warnings': ' '.join(cflags_warn), + 'C FLAGS, modules': ' '.join(cflags_mod), + 'C FLAGS, user specified': ' '.join(get_option('c_args')), + 'LD FLAGS': ' '.join(ldflags + get_option('c_link_args')), + }, + section: 'Compiler Flags', +) + +if llvm.found() summary( { - 'linker': '@0@'.format(cc.get_linker_id()), - 'C compiler': '@0@ @1@'.format(cc.get_id(), cc.version()), + 'C++ compiler': '@0@ @1@'.format(cpp.get_id(), cpp.version()), }, section: 'Compiler', ) summary( { - 'CPP FLAGS': ' '.join(cppflags), - 'C FLAGS, functional': ' '.join(cflags), - 'C FLAGS, warnings': ' '.join(cflags_warn), - 'C FLAGS, modules': ' '.join(cflags_mod), - 'C FLAGS, user specified': ' '.join(get_option('c_args')), - 'LD FLAGS': ' '.join(ldflags + get_option('c_link_args')), + 'C++ FLAGS, functional': ' '.join(cxxflags), + 'C++ FLAGS, warnings': ' '.join(cxxflags_warn), + 'C++ FLAGS, user specified': ' '.join(get_option('cpp_args')), }, section: 'Compiler Flags', ) +endif - if llvm.found() - summary( - { - 'C++ compiler': '@0@ @1@'.format(cpp.get_id(), cpp.version()), - }, - section: 'Compiler', - ) - - summary( - { - 'C++ FLAGS, functional': ' '.join(cxxflags), - 'C++ FLAGS, warnings': ' '.join(cxxflags_warn), - 'C++ FLAGS, user specified': ' '.join(get_option('cpp_args')), - }, - section: 'Compiler Flags', - ) - endif - - summary( - { - 'bison': '@0@ @1@'.format(bison.full_path(), bison_version), - 'dtrace': dtrace, - 'flex': '@0@ @1@'.format(flex.full_path(), flex_version), - }, - section: 'Programs', - ) - - summary( - { - 'bonjour': bonjour, - 'bsd_auth': bsd_auth, - 'docs': docs_dep, - 'docs_pdf': docs_pdf_dep, - 'gss': gssapi, - 'icu': icu, - 'ldap': ldap, - 'libcurl': libcurl, - 'libnuma': libnuma, - 'liburing': liburing, - 'libxml': libxml, - 'libxslt': libxslt, - 'llvm': llvm, - 'lz4': lz4, - 'nls': libintl, - 'openssl': ssl, - 'pam': pam, - 'plperl': [perl_dep, perlversion], - 'plpython': python3_dep, - 'pltcl': tcl_dep, - 'readline': readline, - 'selinux': selinux, - 'systemd': systemd, - 'uuid': uuid, - 'zlib': zlib, - 'zstd': zstd, - }, - section: 'External libraries', - list_sep: ' ', - ) +summary( + { + 'bison': '@0@ @1@'.format(bison.full_path(), bison_version), + 'dtrace': dtrace, + 'flex': '@0@ @1@'.format(flex.full_path(), flex_version), + }, + section: 'Programs', +) -endif +summary( + { + 'bonjour': bonjour, + 'bsd_auth': bsd_auth, + 'docs': docs_dep, + 'docs_pdf': docs_pdf_dep, + 'gss': gssapi, + 'icu': icu, + 'ldap': ldap, + 'libcurl': libcurl, + 'libnuma': libnuma, + 'liburing': liburing, + 'libxml': libxml, + 'libxslt': libxslt, + 'llvm': llvm, + 'lz4': lz4, + 'nls': libintl, + 'openssl': ssl, + 'pam': pam, + 'plperl': [perl_dep, perlversion], + 'plpython': python3_dep, + 'pltcl': tcl_dep, + 'readline': readline, + 'selinux': selinux, + 'systemd': systemd, + 'uuid': uuid, + 'zlib': zlib, + 'zstd': zstd, + }, + section: 'External libraries', + list_sep: ' ', +) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 04952b533ded9..8b1b357beaa04 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -254,7 +254,7 @@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ PG_SYSROOT = @PG_SYSROOT@ -override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) ifdef PGXS override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS) diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 21f2f4af97e3f..926f1e4008abe 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -25,11 +25,11 @@ /* GUC */ int default_toast_compression = TOAST_PGLZ_COMPRESSION; -#define NO_LZ4_SUPPORT() \ +#define NO_COMPRESSION_SUPPORT(method) \ ereport(ERROR, \ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("compression method lz4 not supported"), \ - errdetail("This functionality requires the server to be built with lz4 support."))) + errmsg("compression method %s not supported", method), \ + errdetail("This functionality requires the server to be built with %s support.", method))) /* * Compress a varlena using PGLZ. @@ -139,7 +139,7 @@ struct varlena * lz4_compress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 valsize; @@ -182,7 +182,7 @@ struct varlena * lz4_decompress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -215,7 +215,7 @@ struct varlena * lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -289,7 +289,7 @@ CompressionNameToMethod(const char *compression) else if (strcmp(compression, "lz4") == 0) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); #endif return TOAST_LZ4_COMPRESSION; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4111a8996b5a1..14036c27e878a 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -423,7 +423,7 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ static void lazy_scan_heap(LVRelState *vacrel); static void heap_vacuum_eager_scan_setup(LVRelState *vacrel, - VacuumParams *params); + const VacuumParams params); static BlockNumber heap_vac_scan_next_block(ReadStream *stream, void *callback_private_data, void *per_buffer_data); @@ -431,7 +431,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); -static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, +static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, Buffer vmbuffer, bool all_visible_according_to_vm, bool *has_lpdead_items, bool *vm_page_frozen); @@ -485,7 +485,7 @@ static void restore_vacuum_error_info(LVRelState *vacrel, * vacuum options or for relfrozenxid/relminmxid advancement. */ static void -heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) +heap_vacuum_eager_scan_setup(LVRelState *vacrel, const VacuumParams params) { uint32 randseed; BlockNumber allvisible; @@ -504,7 +504,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) vacrel->eager_scan_remaining_successes = 0; /* If eager scanning is explicitly disabled, just return. */ - if (params->max_eager_freeze_failure_rate == 0) + if (params.max_eager_freeze_failure_rate == 0) return; /* @@ -581,11 +581,11 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) vacrel->next_eager_scan_region_start = randseed % EAGER_SCAN_REGION_SIZE; - Assert(params->max_eager_freeze_failure_rate > 0 && - params->max_eager_freeze_failure_rate <= 1); + Assert(params.max_eager_freeze_failure_rate > 0 && + params.max_eager_freeze_failure_rate <= 1); vacrel->eager_scan_max_fails_per_region = - params->max_eager_freeze_failure_rate * + params.max_eager_freeze_failure_rate * EAGER_SCAN_REGION_SIZE; /* @@ -612,7 +612,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params) * and locked the relation. */ void -heap_vacuum_rel(Relation rel, VacuumParams *params, +heap_vacuum_rel(Relation rel, const VacuumParams params, BufferAccessStrategy bstrategy) { LVRelState *vacrel; @@ -634,9 +634,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, ErrorContextCallback errcallback; char **indnames = NULL; - verbose = (params->options & VACOPT_VERBOSE) != 0; + verbose = (params.options & VACOPT_VERBOSE) != 0; instrument = (verbose || (AmAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); + params.log_min_duration >= 0)); if (instrument) { pg_rusage_init(&ru0); @@ -699,9 +699,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * The truncate param allows user to avoid attempting relation truncation, * though it can't force truncation to happen. */ - Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); - Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && - params->truncate != VACOPTVALUE_AUTO); + Assert(params.index_cleanup != VACOPTVALUE_UNSPECIFIED); + Assert(params.truncate != VACOPTVALUE_UNSPECIFIED && + params.truncate != VACOPTVALUE_AUTO); /* * While VacuumFailSafeActive is reset to false before calling this, we @@ -711,14 +711,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->consider_bypass_optimization = true; vacrel->do_index_vacuuming = true; vacrel->do_index_cleanup = true; - vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED); - if (params->index_cleanup == VACOPTVALUE_DISABLED) + vacrel->do_rel_truncate = (params.truncate != VACOPTVALUE_DISABLED); + if (params.index_cleanup == VACOPTVALUE_DISABLED) { /* Force disable index vacuuming up-front */ vacrel->do_index_vacuuming = false; vacrel->do_index_cleanup = false; } - else if (params->index_cleanup == VACOPTVALUE_ENABLED) + else if (params.index_cleanup == VACOPTVALUE_ENABLED) { /* Force index vacuuming. Note that failsafe can still bypass. */ vacrel->consider_bypass_optimization = false; @@ -726,7 +726,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, else { /* Default/auto, make all decisions dynamically */ - Assert(params->index_cleanup == VACOPTVALUE_AUTO); + Assert(params.index_cleanup == VACOPTVALUE_AUTO); } /* Initialize page counters explicitly (be tidy) */ @@ -789,7 +789,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, */ vacrel->skippedallvis = false; skipwithvm = true; - if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + if (params.options & VACOPT_DISABLE_PAGE_SKIPPING) { /* * Force aggressive mode, and disable skipping blocks using the @@ -830,7 +830,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * is already dangerously old.) */ lazy_check_wraparound_failsafe(vacrel); - dead_items_alloc(vacrel, params->nworkers); + dead_items_alloc(vacrel, params.nworkers); /* * Call lazy_scan_heap to perform all required heap pruning, index @@ -947,9 +947,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - if (verbose || params->log_min_duration == 0 || + if (verbose || params.log_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) + params.log_min_duration)) { long secs_dur; int usecs_dur; @@ -984,10 +984,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * Aggressiveness already reported earlier, in dedicated * VACUUM VERBOSE ereport */ - Assert(!params->is_wraparound); + Assert(!params.is_wraparound); msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); } - else if (params->is_wraparound) + else if (params.is_wraparound) { /* * While it's possible for a VACUUM to be both is_wraparound @@ -1245,6 +1245,7 @@ lazy_scan_heap(LVRelState *vacrel) Buffer buf; Page page; uint8 blk_info = 0; + int ndeleted = 0; bool has_lpdead_items; void *per_buffer_data = NULL; bool vm_page_frozen = false; @@ -1387,10 +1388,10 @@ lazy_scan_heap(LVRelState *vacrel) * line pointers previously marked LP_DEAD. */ if (got_cleanup_lock) - lazy_scan_prune(vacrel, buf, blkno, page, - vmbuffer, - blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, - &has_lpdead_items, &vm_page_frozen); + ndeleted = lazy_scan_prune(vacrel, buf, blkno, page, + vmbuffer, + blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, + &has_lpdead_items, &vm_page_frozen); /* * Count an eagerly scanned page as a failure or a success. @@ -1481,7 +1482,7 @@ lazy_scan_heap(LVRelState *vacrel) * table has indexes. There will only be newly-freed space if we * held the cleanup lock and lazy_scan_prune() was called. */ - if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items && + if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 && blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) { FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, @@ -1936,8 +1937,10 @@ cmpOffsetNumbers(const void *a, const void *b) * *vm_page_frozen is set to true if the page is newly set all-frozen in the * VM. The caller currently only uses this for determining whether an eagerly * scanned page was successfully set all-frozen. + * + * Returns the number of tuples deleted from the page during HOT pruning. */ -static void +static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, @@ -2208,6 +2211,8 @@ lazy_scan_prune(LVRelState *vacrel, *vm_page_frozen = true; } } + + return presult.ndeleted; } /* diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e29..8f918e00af7ed 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -364,7 +364,7 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { *vmbuf = vm_readbuf(rel, mapBlock, false); if (!BufferIsValid(*vmbuf)) - return false; + return (uint8) 0; } map = PageGetContents(BufferGetPage(*vmbuf)); diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index f0f4f974bcedb..60684c5342279 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -42,6 +42,19 @@ GetIndexAmRoutine(Oid amhandler) elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct", amhandler); + /* Assert that all required callbacks are present. */ + Assert(routine->ambuild != NULL); + Assert(routine->ambuildempty != NULL); + Assert(routine->aminsert != NULL); + Assert(routine->ambulkdelete != NULL); + Assert(routine->amvacuumcleanup != NULL); + Assert(routine->amcostestimate != NULL); + Assert(routine->amoptions != NULL); + Assert(routine->amvalidate != NULL); + Assert(routine->ambeginscan != NULL); + Assert(routine->amrescan != NULL); + Assert(routine->amendscan != NULL); + return routine; } diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index a136e4bbfdfb5..21c519cd108ed 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/nbtree.h" +#include "common/int.h" #include "lib/qunique.h" #include "utils/array.h" #include "utils/lsyscache.h" @@ -56,6 +57,8 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, BTArrayKeyInfo *array); static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, BTArrayKeyInfo *array); +static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap); +static int _bt_reorder_array_cmp(const void *a, const void *b); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out, @@ -96,7 +99,7 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * incomplete sets of cross-type operators, we may fail to detect redundant * or contradictory keys, but we can survive that.) * - * The output keys must be sorted by index attribute. Presently we expect + * Required output keys are sorted by index attribute. Presently we expect * (but verify) that the input keys are already so sorted --- this is done * by match_clauses_to_index() in indxpath.c. Some reordering of the keys * within each attribute may be done as a byproduct of the processing here. @@ -127,29 +130,36 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * This has the potential to be much more efficient than a full index scan * (though it behaves like a full scan when there's many distinct "x" values). * - * If possible, redundant keys are eliminated: we keep only the tightest + * Typically, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest />= or both - * 4::int AND x > 10::bigint", and we are unable to determine - * which key is more restrictive for lack of a suitable cross-type operator. - * _bt_first will arbitrarily pick one of the keys to do the initial - * positioning with. If it picks x > 4, then the x > 10 condition will fail - * until we reach index entries > 10; but we can't stop the scan just because - * x > 10 is failing. On the other hand, if we are scanning backwards, then - * failure of either key is indeed enough to stop the scan. (In general, when - * inequality keys are present, the initial-positioning code only promises to - * position before the first possible match, not exactly at the first match, - * for a forward scan; or after the last match for a backward scan.) + * we cannot eliminate either key. + * + * When all redundant keys could not be eliminated, we'll output a key array + * that can more or less be treated as if it had no redundant keys. Suppose + * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to + * determine which > key is more restrictive for lack of a suitable cross-type + * operator. We'll arbitrarily pick one of the > keys; the other > key won't + * be marked required. Obviously, the scan will be less efficient if we + * choose x > 4 over x > 10 -- but it can still largely proceed as if there + * was only a single > condition. "x > 10" will be placed at the end of the + * so->keyData[] output array. It'll always be evaluated last, after the keys + * that could be marked required in the usual way (after "x > 4 AND x < 70"). + * This can sometimes result in so->keyData[] keys that aren't even in index + * attribute order (if the qual involves multiple attributes). The scan's + * required keys will still be in attribute order, though, so it can't matter. + * + * This scheme ensures that _bt_first always uses the same set of keys at the + * start of a forwards scan as those _bt_checkkeys uses to determine when to + * end a similar backwards scan (and vice-versa). _bt_advance_array_keys + * depends on this: it expects to be able to reliably predict what the next + * _bt_first call will do by testing whether _bt_checkkeys' routines report + * that the final tuple on the page is past the end of matches for the scan's + * keys with the scan direction flipped. If it is (if continuescan=false), + * then it follows that calling _bt_first will, at a minimum, relocate the + * scan to the very next leaf page (in the current scan direction). * * As a byproduct of this work, we can detect contradictory quals such * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false, @@ -188,7 +198,8 @@ _bt_preprocess_keys(IndexScanDesc scan) int numberOfEqualCols; ScanKey inkeys; BTScanKeyPreproc xform[BTMaxStrategyNumber]; - bool test_result; + bool test_result, + redundant_key_kept = false; AttrNumber attno; ScanKey arrayKeyData; int *keyDataMap = NULL; @@ -388,7 +399,8 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = NULL; xform[j].inkeyi = -1; } - /* else, cannot determine redundancy, keep both keys */ + else + redundant_key_kept = true; } /* track number of attrs for which we have "=" keys */ numberOfEqualCols++; @@ -409,6 +421,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTLessStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* try to keep only one of >, >= */ @@ -426,6 +440,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTGreaterStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* @@ -466,25 +482,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* check strategy this key's operator corresponds to */ j = inkey->sk_strategy - 1; - /* if row comparison, push it directly to the output array */ - if (inkey->sk_flags & SK_ROW_HEADER) - { - ScanKey outkey = &so->keyData[new_numberOfKeys++]; - - memcpy(outkey, inkey, sizeof(ScanKeyData)); - if (arrayKeyData) - keyDataMap[new_numberOfKeys - 1] = i; - if (numberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); - - /* - * We don't support RowCompare using equality; such a qual would - * mess up the numberOfEqualCols tracking. - */ - Assert(j != (BTEqualStrategyNumber - 1)); - continue; - } - if (inkey->sk_strategy == BTEqualStrategyNumber && (inkey->sk_flags & SK_SEARCHARRAY)) { @@ -593,9 +590,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * the new scan key. * * Note: We do things this way around so that our arrays are - * always in the same order as their corresponding scan keys, - * even with incomplete opfamilies. _bt_advance_array_keys - * depends on this. + * always in the same order as their corresponding scan keys. + * _bt_preprocess_array_keys_final expects this. */ ScanKey outkey = &so->keyData[new_numberOfKeys++]; @@ -607,6 +603,7 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = inkey; xform[j].inkeyi = i; xform[j].arrayidx = arrayidx; + redundant_key_kept = true; } } } @@ -622,6 +619,15 @@ _bt_preprocess_keys(IndexScanDesc scan) if (arrayKeyData) _bt_preprocess_array_keys_final(scan, keyDataMap); + /* + * If there are remaining redundant inequality keys, we must make sure + * that each index attribute has no more than one required >/>= key, and + * no more than one required qual_ok) + _bt_unmark_keys(scan, keyDataMap); + /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */ } @@ -746,9 +752,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) * * Depending on the operator type, the key may be required for both scan * directions or just one. Also, if the key is a row comparison header, - * we have to mark its first subsidiary ScanKey as required. (Subsequent - * subsidiary ScanKeys are normally for lower-order columns, and thus - * cannot be required, since they're after the first non-equality scankey.) + * we have to mark the appropriate subsidiary ScanKeys as required. In such + * cases, the first subsidiary key is required, but subsequent ones are + * required only as long as they correspond to successive index columns and + * match the leading column as to sort direction. Otherwise the row + * comparison ordering is different from the index ordering and so we can't + * stop the scan on the basis of those lower-order columns. * * Note: when we set required-key flag bits in a subsidiary scankey, we are * scribbling on a data structure belonging to the index AM's caller, not on @@ -786,12 +795,25 @@ _bt_mark_scankey_required(ScanKey skey) if (skey->sk_flags & SK_ROW_HEADER) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + AttrNumber attno = skey->sk_attno; /* First subkey should be same column/operator as the header */ - Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == skey->sk_attno); + Assert(subkey->sk_attno == attno); Assert(subkey->sk_strategy == skey->sk_strategy); - subkey->sk_flags |= addflags; + + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != attno) + break; /* non-adjacent key, so not required */ + if (subkey->sk_strategy != skey->sk_strategy) + break; /* wrong direction, so not required */ + subkey->sk_flags |= addflags; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + attno++; + } } } @@ -847,8 +869,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, cmp_op; StrategyNumber strat; - Assert(!((leftarg->sk_flags | rightarg->sk_flags) & - (SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER)); /* * First, deal with cases where one or both args are NULL. This should @@ -924,6 +945,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, return true; } + /* + * We don't yet know how to determine redundancy when it involves a row + * compare key (barring simple cases involving IS NULL/IS NOT NULL) + */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER) + { + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); + return false; + } + /* * If either leftarg or rightarg are equality-type array scankeys, we need * specialized handling (since by now we know that IS NULL wasn't used) @@ -1467,6 +1498,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, } } +/* + * _bt_unmark_keys() -- make superfluous required keys nonrequired after all + * + * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it + * calls here to make sure that no index attribute has more than one > or >= + * key marked required, and no more than one required < or <= key. Attributes + * with = keys will always get one = key as their required key. All other + * keys that were initially marked required get "unmarked" here. That way, + * _bt_first and _bt_checkkeys will reliably agree on which keys to use to + * start and/or to end the scan. + * + * We also relocate keys that become/started out nonrequired to the end of + * so->keyData[]. That way, _bt_first and _bt_checkkeys cannot fail to reach + * a required key due to some earlier nonrequired key getting in the way. + * + * Only call here when _bt_compare_scankey_args returned false at least once + * (otherwise, calling here will just waste cycles). + */ +static void +_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + AttrNumber attno; + bool *unmarkikey; + int nunmark, + nunmarked, + nkept, + firsti; + ScanKey keepKeys, + unmarkKeys; + FmgrInfo *keepOrderProcs = NULL, + *unmarkOrderProcs = NULL; + bool haveReqEquals, + haveReqForward, + haveReqBackward; + + /* + * Do an initial pass over so->keyData[] that determines which keys to + * keep as required. We expect so->keyData[] to still be in attribute + * order when we're called (though we don't expect any particular order + * among each attribute's keys). + * + * When both equality and inequality keys remain on a single attribute, we + * *must* make sure that exactly one of the equalities remains required. + * Any requiredness markings that we might leave on later keys/attributes + * are predicated on there being required = keys on all prior columns. + */ + unmarkikey = palloc0(so->numberOfKeys * sizeof(bool)); + nunmark = 0; + + /* Set things up for first key's attribute */ + attno = so->keyData[0].sk_attno; + firsti = 0; + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + + if (origkey->sk_attno != attno) + { + /* Reset for next attribute */ + attno = origkey->sk_attno; + firsti = i; + + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + } + + /* Equalities get priority over inequalities */ + if (haveReqEquals) + { + /* + * We already found the first "=" key for this attribute. We've + * already decided that all its other keys will be unmarked. + */ + Assert(!(origkey->sk_flags & SK_SEARCHNULL)); + unmarkikey[i] = true; + nunmark++; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQFWD) && + (origkey->sk_flags & SK_BT_REQBKWD)) + { + /* + * Found the first "=" key for attno. All other attno keys will + * be unmarked. + */ + Assert(origkey->sk_strategy == BTEqualStrategyNumber); + + haveReqEquals = true; + for (int j = firsti; j < i; j++) + { + /* Unmark any prior inequality keys on attno after all */ + if (!unmarkikey[j]) + { + unmarkikey[j] = true; + nunmark++; + } + } + continue; + } + + /* Deal with inequalities next */ + if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward) + { + haveReqForward = true; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward) + { + haveReqBackward = true; + continue; + } + + /* + * We have either a redundant inequality key that will be unmarked, or + * we have a key that wasn't marked required in the first place + */ + unmarkikey[i] = true; + nunmark++; + } + + /* Should only be called when _bt_compare_scankey_args reported failure */ + Assert(nunmark > 0); + + /* + * Next, allocate temp arrays: one for required keys that'll remain + * required, the other for all remaining keys + */ + unmarkKeys = palloc(nunmark * sizeof(ScanKeyData)); + keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData)); + nunmarked = 0; + nkept = 0; + if (so->numArrayKeys) + { + unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo)); + keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo)); + } + + /* + * Next, copy the contents of so->keyData[] into the appropriate temp + * array. + * + * Scans with = array keys need us to maintain invariants around the order + * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[]. See + * _bt_preprocess_array_keys_final for a full explanation. + */ + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + ScanKey unmark; + + if (!unmarkikey[i]) + { + /* + * Key gets to keep its original requiredness markings. + * + * Key will stay in its original position, unless we're going to + * unmark an earlier key (in which case this key gets moved back). + */ + memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = nkept; + memcpy(keepOrderProcs + nkept, &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + nkept++; + continue; + } + + /* + * Key will be unmarked as needed, and moved to the end of the array, + * next to other keys that will become (or always were) nonrequired + */ + unmark = unmarkKeys + nunmarked; + memcpy(unmark, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked; + memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + /* + * Preprocessing only generates skip arrays when it knows that they'll + * be the only required = key on the attr. We'll never unmark them. + */ + Assert(!(unmark->sk_flags & SK_BT_SKIP)); + + /* + * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key. + * They aren't cross-type, so an incomplete opfamily can't matter. + */ + Assert(!(unmark->sk_flags & SK_ISNULL) || + !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))); + + /* Clear requiredness flags on redundant key (and on any subkeys) */ + unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (unmark->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(unmark->sk_argument); + + Assert(subkey->sk_strategy == unmark->sk_strategy); + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + } + + nunmarked++; + } + + /* Copy both temp arrays back into so->keyData[] to reorder */ + Assert(nkept == so->numberOfKeys - nunmark); + Assert(nunmarked == nunmark); + memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept); + memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked); + + /* Done with temp arrays */ + pfree(unmarkikey); + pfree(keepKeys); + pfree(unmarkKeys); + + /* + * Now copy so->orderProcs[] temp entries needed by scans with = array + * keys back (just like with the so->keyData[] temp arrays) + */ + if (so->numArrayKeys) + { + memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept); + memcpy(so->orderProcs + nkept, unmarkOrderProcs, + sizeof(FmgrInfo) * nunmarked); + + /* Also fix-up array->scan_key references */ + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + BTArrayKeyInfo *array = &so->arrayKeys[arridx]; + + array->scan_key = keyDataMap[array->scan_key]; + } + + /* + * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key + * offsets, so that its order matches so->keyData[] order as expected + */ + qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo), + _bt_reorder_array_cmp); + + /* Done with temp arrays */ + pfree(unmarkOrderProcs); + pfree(keepOrderProcs); + } +} + +/* + * qsort comparator for reordering so->arrayKeys[] BTArrayKeyInfo entries + */ +static int +_bt_reorder_array_cmp(const void *a, const void *b) +{ + BTArrayKeyInfo *arraya = (BTArrayKeyInfo *) a; + BTArrayKeyInfo *arrayb = (BTArrayKeyInfo *) b; + + return pg_cmp_s32(arraya->scan_key, arrayb->scan_key); +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 36544ecfd5878..d69798795b43b 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -892,9 +892,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkey; int keysz = 0; - StrategyNumber strat_total; + StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; @@ -960,46 +960,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /*---------- * Examine the scan keys to discover where we need to start the scan. + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. The final + * startKeys[] entry's strategy is set in strat_total. (Actually, there + * are a couple of cases where we force a less/more restrictive strategy.) * - * We want to identify the keys that can be used as starting boundaries; - * these are =, >, or >= keys for a forward scan or =, <, <= keys for - * a backwards scan. We can use keys for multiple attributes so long as - * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept - * a > or < boundary or find an attribute with no boundary (which can be - * thought of as the same as "> -infinity"), we can't use keys for any - * attributes to its right, because it would break our simplistic notion - * of what initial positioning strategy to use. + * We must use the key that was marked required (in the direction opposite + * our own scan's) during preprocessing. Each index attribute can only + * have one such required key. In general, the keys that we use to find + * an initial position when scanning forwards are the same keys that end + * the scan on the leaf level when scanning backwards (and vice-versa). * * When the scan keys include cross-type operators, _bt_preprocess_keys - * may not be able to eliminate redundant keys; in such cases we will - * arbitrarily pick a usable one for each attribute. This is correct - * but possibly not optimal behavior. (For example, with keys like - * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when - * x=5 would be more efficient.) Since the situation only arises given - * a poorly-worded query plus an incomplete opfamily, live with it. + * may not be able to eliminate redundant keys; in such cases it will + * arbitrarily pick a usable key for each attribute (and scan direction), + * ensuring that there is no more than one key required in each direction. + * We stop considering further keys once we reach the first nonrequired + * key (which must come after all required keys), so this can't affect us. + * + * The required keys that we use as starting boundaries have to be =, >, + * or >= keys for a forward scan or =, <, <= keys for a backwards scan. + * We can use keys for multiple attributes so long as the prior attributes + * had only =, >= (resp. =, <=) keys. These rules are very similar to the + * rules that preprocessing used to determine which keys to mark required. + * We cannot always use every required key as a positioning key, though. + * Skip arrays necessitate independently applying our own rules here. + * Skip arrays are always generally considered = array keys, but we'll + * nevertheless treat them as inequalities at certain points of the scan. + * When that happens, it _might_ have implications for the number of + * required keys that we can safely use for initial positioning purposes. * - * When both equality and inequality keys appear for a single attribute - * (again, only possible when cross-type operators appear), we *must* - * select one of the equality keys for the starting point, because - * _bt_checkkeys() will stop the scan as soon as an equality qual fails. - * For example, if we have keys like "x >= 4 AND x = 10" and we elect to - * start at x=4, we will fail and stop before reaching x=10. If multiple - * equality quals survive preprocessing, however, it doesn't matter which - * one we use --- by definition, they are either redundant or - * contradictory. + * For example, a forward scan with a skip array on its leading attribute + * (with no low_compare/high_compare) will have at least two required scan + * keys, but we won't use any of them as boundary keys during the scan's + * initial call here. Our positioning key during the first call here can + * be thought of as representing "> -infinity". Similarly, if such a skip + * array's low_compare is "a > 'foo'", then we position using "a > 'foo'" + * during the scan's initial call here; a lower-order key such as "b = 42" + * can't be used until the "a" array advances beyond MINVAL/low_compare. * - * In practice we rarely see any "attribute boundary key gaps" here. - * Preprocessing can usually backfill skip array keys for any attributes - * that were omitted from the original scan->keyData[] input keys. All - * array keys are always considered = keys, but we'll sometimes need to - * treat the current key value as if we were using an inequality strategy. - * This happens with range skip arrays, which store inequality keys in the - * array's low_compare/high_compare fields (used to find the first/last - * set of matches, when = key will lack a usable sk_argument value). - * These are always preferred over any redundant "standard" inequality - * keys on the same column (per the usual rule about preferring = keys). - * Note also that any column with an = skip array key can never have an - * additional, contradictory = key. + * On the other hand, if such a skip array's low_compare was "a >= 'foo'", + * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here. + * A subsequent call here might have us use "a = 'fop' AND b = 42". Note + * that we treat = and >= as equivalent when scanning forwards (just as we + * treat = and <= as equivalent when scanning backwards). We effectively + * do the same thing (though with a distinct "a" element/value) each time. * * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. @@ -1011,41 +1016,38 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * traversing a lot of null entries at the start of the scan. * * In this loop, row-comparison keys are treated the same as keys on their - * first (leftmost) columns. We'll add on lower-order columns of the row - * comparison below, if possible. + * first (leftmost) columns. We'll add all lower-order columns of the row + * comparison that were marked required during preprocessing below. * - * The selected scan keys (at most one per index column) are remembered by - * storing their addresses into the local startKeys[] array. - * - * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start - * the next primitive index scan (for scans with array keys) based in part - * on an understanding of how it'll enable us to reposition the scan. - * They're directly aware of how we'll sometimes cons up an explicit - * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a - * symmetric "deduce NOT NULL" rule of their own. This allows top-level - * scans to skip large groups of NULLs through repeated deductions about - * key strictness (for a required inequality key) and whether NULLs in the - * key's index column are stored last or first (relative to non-NULLs). + * _bt_advance_array_keys needs to know exactly how we'll reposition the + * scan (should it opt to schedule another primitive index scan). It is + * critical that primscans only be scheduled when they'll definitely make + * some useful progress. _bt_advance_array_keys does this by calling + * _bt_checkkeys routines that report whether a tuple is past the end of + * matches for the scan's keys (given the scan's current array elements). + * If the page's final tuple is "after the end of matches" for a scan that + * uses the *opposite* scan direction, then it must follow that it's also + * "before the start of matches" for the actual current scan direction. + * It is therefore essential that all of our initial positioning rules are + * symmetric with _bt_checkkeys's corresponding continuescan=false rule. * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might * need to be kept in sync. *---------- */ - strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { AttrNumber curattr; - ScanKey chosen; + ScanKey bkey; ScanKey impliesNN; ScanKey cur; /* - * chosen is the so-far-chosen key for the current attribute, if any. - * We don't cast the decision in stone until we reach keys for the - * next attribute. + * bkey will be set to the key that preprocessing left behind as the + * boundary key for this attribute, in this scan direction (if any) */ cur = so->keyData; curattr = 1; - chosen = NULL; + bkey = NULL; /* Also remember any scankey that implies a NOT NULL constraint */ impliesNN = NULL; @@ -1058,23 +1060,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* Done looking for the curattr boundary key */ + Assert(bkey == NULL || + (bkey->sk_attno == curattr && + (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + Assert(impliesNN == NULL || + (impliesNN->sk_attno == curattr && + (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + /* - * Done looking at keys for curattr. - * * If this is a scan key for a skip array whose current * element is MINVAL, choose low_compare (when scanning * backwards it'll be MAXVAL, and we'll choose high_compare). * - * Note: if the array's low_compare key makes 'chosen' NULL, + * Note: if the array's low_compare key makes 'bkey' NULL, * then we behave as if the array's first element is -inf, * except when !array->null_elem implies a usable NOT NULL * constraint. */ - if (chosen != NULL && - (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + if (bkey != NULL && + (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) { - int ikey = chosen - so->keyData; - ScanKey skipequalitykey = chosen; + int ikey = bkey - so->keyData; + ScanKey skipequalitykey = bkey; BTArrayKeyInfo *array = NULL; for (int arridx = 0; arridx < so->numArrayKeys; arridx++) @@ -1087,42 +1095,41 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (ScanDirectionIsForward(dir)) { Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); - chosen = array->low_compare; + bkey = array->low_compare; } else { Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); - chosen = array->high_compare; + bkey = array->high_compare; } - Assert(chosen == NULL || - chosen->sk_attno == skipequalitykey->sk_attno); + Assert(bkey == NULL || + bkey->sk_attno == skipequalitykey->sk_attno); if (!array->null_elem) impliesNN = skipequalitykey; else - Assert(chosen == NULL && impliesNN == NULL); + Assert(bkey == NULL && impliesNN == NULL); } /* * If we didn't find a usable boundary key, see if we can * deduce a NOT NULL key */ - if (chosen == NULL && impliesNN != NULL && + if (bkey == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { - /* Yes, so build the key in notnullkeys[keysz] */ - chosen = ¬nullkeys[keysz]; - ScanKeyEntryInitialize(chosen, + /* Final startKeys[] entry will be deduced NOT NULL key */ + bkey = ¬nullkey; + ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), + ScanDirectionIsForward(dir) ? + BTGreaterStrategyNumber : BTLessStrategyNumber, InvalidOid, InvalidOid, InvalidOid, @@ -1130,12 +1137,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * If we still didn't find a usable boundary key, quit; else - * save the boundary key pointer in startKeys. + * If preprocessing didn't leave a usable boundary key, quit; + * else save the boundary key pointer in startKeys[] */ - if (chosen == NULL) + if (bkey == NULL) break; - startKeys[keysz++] = chosen; + startKeys[keysz++] = bkey; /* * We can only consider adding more boundary keys when the one @@ -1143,7 +1150,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * (during backwards scans we can only do so when the key that * we just added to startKeys[] uses the = or <= strategy) */ - strat_total = chosen->sk_strategy; + strat_total = bkey->sk_strategy; if (strat_total == BTGreaterStrategyNumber || strat_total == BTLessStrategyNumber) break; @@ -1154,19 +1161,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * make strat_total > or < (and stop adding boundary keys). * This can only happen with opclasses that lack skip support. */ - if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) { - Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(bkey->sk_flags & SK_BT_SKIP); Assert(strat_total == BTEqualStrategyNumber); if (ScanDirectionIsForward(dir)) { - Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + Assert(!(bkey->sk_flags & SK_BT_PRIOR)); strat_total = BTGreaterStrategyNumber; } else { - Assert(!(chosen->sk_flags & SK_BT_NEXT)); + Assert(!(bkey->sk_flags & SK_BT_NEXT)); strat_total = BTLessStrategyNumber; } @@ -1180,24 +1187,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * Done if that was the last scan key output by preprocessing. - * Also done if there is a gap index attribute that lacks a - * usable key (only possible when preprocessing was unable to - * generate a skip array key to "fill in the gap"). + * Also done if we've now examined all keys marked required. */ if (i >= so->numberOfKeys || - cur->sk_attno != curattr + 1) + !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) break; /* * Reset for next attr. */ + Assert(cur->sk_attno == curattr + 1); curattr = cur->sk_attno; - chosen = NULL; + bkey = NULL; impliesNN = NULL; } /* - * Can we use this key as a starting boundary for this attr? + * If we've located the starting boundary key for curattr, we have + * no interest in curattr's other required key + */ + if (bkey != NULL) + continue; + + /* + * Is this key the starting boundary key for curattr? * * If not, does it imply a NOT NULL constraint? (Because * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, @@ -1207,27 +1220,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsBackward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsBackward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; case BTEqualStrategyNumber: - /* override any non-equality choice */ - chosen = cur; + bkey = cur; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsForward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsForward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; } } @@ -1253,16 +1259,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(keysz <= INDEX_MAX_KEYS); for (int i = 0; i < keysz; i++) { - ScanKey cur = startKeys[i]; + ScanKey bkey = startKeys[i]; - Assert(cur->sk_attno == i + 1); + Assert(bkey->sk_attno == i + 1); - if (cur->sk_flags & SK_ROW_HEADER) + if (bkey->sk_flags & SK_ROW_HEADER) { /* * Row comparison header: look to the first row member instead */ - ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument); + bool loosen_strat = false, + tighten_strat = false; /* * Cannot be a NULL in the first row member: _bt_preprocess_keys @@ -1270,9 +1278,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * ever getting this far */ Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == cur->sk_attno); + Assert(subkey->sk_attno == bkey->sk_attno); Assert(!(subkey->sk_flags & SK_ISNULL)); + /* + * This is either a > or >= key (during backwards scans it is + * either < or <=) that was marked required during preprocessing. + * Later so->keyData[] keys can't have been marked required, so + * our row compare header key must be the final startKeys[] entry. + */ + Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)); + Assert(i == keysz - 1); + /* * The member scankeys are already in insertion format (ie, they * have sk_func = 3-way-comparison function) @@ -1280,112 +1297,141 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); /* - * If the row comparison is the last positioning key we accepted, - * try to add additional keys from the lower-order row members. - * (If we accepted independent conditions on additional index - * columns, we use those instead --- doesn't seem worth trying to - * determine which is more restrictive.) Note that this is OK - * even if the row comparison is of ">" or "<" type, because the - * condition applied to all but the last row member is effectively - * ">=" or "<=", and so the extra keys don't break the positioning - * scheme. But, by the same token, if we aren't able to use all - * the row members, then the part of the row comparison that we - * did use has to be treated as just a ">=" or "<=" condition, and - * so we'd better adjust strat_total accordingly. + * Now look to later row compare members. + * + * If there's an "index attribute gap" between two row compare + * members, the second member won't have been marked required, and + * so can't be used as a starting boundary key here. The part of + * the row comparison that we do still use has to be treated as a + * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)" + * with an omitted intervening index attribute "b" will use an + * insertion scan key "a >= 1". Even the first "a = 1" tuple on + * the leaf level might satisfy the row compare qual. + * + * We're able to use a _more_ restrictive strategy when we reach a + * NULL row compare member, since they're always unsatisfiable. + * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an + * insertion scan key "a > 1". All tuples where "a = 1" cannot + * possibly satisfy the row compare qual, so this is safe. */ - if (i == keysz - 1) + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) { - bool used_all_subkeys = false; + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(!(subkey->sk_flags & SK_ROW_END)); - for (;;) + if (subkey->sk_flags & SK_ISNULL) { - subkey++; - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_attno != keysz + 1) - break; /* out-of-sequence, can't use it */ - if (subkey->sk_strategy != cur->sk_strategy) - break; /* wrong direction, can't use it */ - if (subkey->sk_flags & SK_ISNULL) - break; /* can't use null keys */ - Assert(keysz < INDEX_MAX_KEYS); - memcpy(inskey.scankeys + keysz, subkey, - sizeof(ScanKeyData)); - keysz++; - if (subkey->sk_flags & SK_ROW_END) - { - used_all_subkeys = true; - break; - } + /* + * NULL member key, can only use earlier keys. + * + * We deliberately avoid checking if this key is marked + * required. All earlier keys are required, and this key + * is unsatisfiable either way, so we can't miss anything. + */ + tighten_strat = true; + break; } - if (!used_all_subkeys) + + if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { - switch (strat_total) - { - case BTLessStrategyNumber: - strat_total = BTLessEqualStrategyNumber; - break; - case BTGreaterStrategyNumber: - strat_total = BTGreaterEqualStrategyNumber; - break; - } + /* nonrequired member key, can only use earlier keys */ + loosen_strat = true; + break; } - break; /* done with outer loop */ + + Assert(subkey->sk_attno == keysz + 1); + Assert(subkey->sk_strategy == bkey->sk_strategy); + Assert(keysz < INDEX_MAX_KEYS); + + memcpy(inskey.scankeys + keysz, subkey, + sizeof(ScanKeyData)); + keysz++; + if (subkey->sk_flags & SK_ROW_END) + break; } - } - else - { - /* - * Ordinary comparison key. Transform the search-style scan key - * to an insertion scan key by replacing the sk_func with the - * appropriate btree comparison function. - * - * If scankey operator is not a cross-type comparison, we can use - * the cached comparison function; otherwise gotta look it up in - * the catalogs. (That can't lead to infinite recursion, since no - * indexscan initiated by syscache lookup will use cross-data-type - * operators.) - * - * We support the convention that sk_subtype == InvalidOid means - * the opclass input type; this is a hack to simplify life for - * ScanKeyInit(). - */ - if (cur->sk_subtype == rel->rd_opcintype[i] || - cur->sk_subtype == InvalidOid) + Assert(!(loosen_strat && tighten_strat)); + if (loosen_strat) { - FmgrInfo *procinfo; - - procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - procinfo, - cur->sk_argument); + /* Use less restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } } - else + if (tighten_strat) { - RegProcedure cmp_proc; - - cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], - rel->rd_opcintype[i], - cur->sk_subtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", - BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, - cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - cmp_proc, - cur->sk_argument); + /* Use more restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessEqualStrategyNumber: + strat_total = BTLessStrategyNumber; + break; + case BTGreaterEqualStrategyNumber: + strat_total = BTGreaterStrategyNumber; + break; + } } + + /* done adding to inskey (row comparison keys always come last) */ + break; + } + + /* + * Ordinary comparison key/search-style key. + * + * Transform the search-style scan key to an insertion scan key by + * replacing the sk_func with the appropriate btree 3-way-comparison + * function. + * + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the + * catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means the + * opclass input type; this hack simplifies life for ScanKeyInit(). + */ + if (bkey->sk_subtype == rel->rd_opcintype[i] || + bkey->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + procinfo, + bkey->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + bkey->sk_subtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype, + bkey->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + cmp_proc, + bkey->sk_argument); } } @@ -1474,6 +1520,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { + Assert(!so->needPrimScan); + /* * We only get here if the index is completely empty. Lock relation * because nothing finer to lock exists. Without a buffer lock, it's @@ -1492,7 +1540,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { - Assert(!so->needPrimScan); _bt_parallel_done(scan); return false; } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index c71d1b6f2e1e0..9aed207995f52 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, bool *skip_array_set); -static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind); @@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required); #ifdef USE_ASSERT_CHECKING -static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); #endif static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, @@ -1034,73 +1032,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, return false; } -/* - * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required - * - * Called when _bt_advance_array_keys decides to start a new primitive index - * scan on the basis of the current scan position being before the position - * that _bt_first is capable of repositioning the scan to by applying an - * inequality operator required in the opposite-to-scan direction only. - * - * Although equality strategy scan keys (for both arrays and non-arrays alike) - * are either marked required in both directions or in neither direction, - * there is a sense in which non-required arrays behave like required arrays. - * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", - * the scan key on "c" is non-required, but nevertheless enables positioning - * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the - * first descent of the tree by _bt_first. Later on, there could also be a - * second descent, that places the scan right before tuples >= "(200, 3, 5)". - * _bt_first must never be allowed to build an insertion scan key whose "c" - * entry is set to a value other than 5, the "c" array's first element/value. - * (Actually, it's the first in the current scan direction. This example uses - * a forward scan.) - * - * Calling here resets the array scan key elements for the scan's non-required - * arrays. This is strictly necessary for correctness in a subset of cases - * involving "required in opposite direction"-triggered primitive index scans. - * Not all callers are at risk of _bt_first using a non-required array like - * this, but advancement always resets the arrays when another primitive scan - * is scheduled, just to keep things simple. Array advancement even makes - * sure to reset non-required arrays during scans that have no inequalities. - * (Advancement still won't call here when there are no inequalities, though - * that's just because it's all handled indirectly instead.) - * - * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that - * everybody got this right. - * - * Note: In practice almost all SAOP arrays are marked required during - * preprocessing (if necessary by generating skip arrays). It is hardly ever - * truly necessary to call here, but consistently doing so is simpler. - */ -static void -_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == ikey); - - if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) - continue; - - Assert(array->num_elems != -1); /* No non-required skip arrays */ - - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsForward(dir)); - } -} - /* * _bt_tuple_before_array_skeys() -- too early to advance required arrays? * @@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) */ if (so->needPrimScan) { - Assert(_bt_verify_arrays_bt_first(scan, dir)); - /* * Flag was set -- must call _bt_first again, which will reset the * scan's needPrimScan flag @@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ else if (has_required_opposite_direction_only && pstate->finaltup && unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) - { - /* - * Make sure that any SAOP arrays that were not marked required by - * preprocessing are reset to their first element for this direction - */ - _bt_rewind_nonrequired_arrays(scan, dir); goto new_prim_scan; - } continue_scan: @@ -2045,8 +1967,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ so->oppositeDirCheck = has_required_opposite_direction_only; - _bt_rewind_nonrequired_arrays(scan, dir); - /* * skip by setting "look ahead" mechanism's offnum for forwards scans * (backwards scans check scanBehind flag directly instead) @@ -2142,48 +2062,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } #ifdef USE_ASSERT_CHECKING -/* - * Verify that the scan's qual state matches what we expect at the point that - * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. - * - * We enforce a rule against non-required array scan keys: they must start out - * with whatever element is the first for the scan's current scan direction. - * See _bt_rewind_nonrequired_arrays comments for an explanation. - */ -static bool -_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - int first_elem_dir; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - - if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) - continue; - - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array->cur_elem != first_elem_dir) - return false; - } - - return _bt_verify_keys_with_arraykeys(scan); -} - /* * Verify that the scan's "so->keyData[]" scan keys are in agreement with * its array key state @@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int last_sk_attno = InvalidAttrNumber, arrayidx = 0; + bool nonrequiredseen = false; if (!so->qual_ok) return false; @@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->num_elems != -1 && cur->sk_argument != array->elem_values[array->cur_elem]) return false; - if (last_sk_attno > cur->sk_attno) - return false; + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + if (last_sk_attno > cur->sk_attno) + return false; + if (nonrequiredseen) + return false; + } + else + nonrequiredseen = true; + last_sk_attno = cur->sk_attno; } @@ -2551,37 +2438,12 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { /* Scan key isn't marked required (corner case) */ - Assert(!(key->sk_flags & SK_ROW_HEADER)); break; /* unsafe */ } if (key->sk_flags & SK_ROW_HEADER) { - /* - * RowCompare inequality. - * - * Only the first subkey from a RowCompare can ever be marked - * required (that happens when the row header is marked required). - * There is no simple, general way for us to transitively deduce - * whether or not every tuple on the page satisfies a RowCompare - * key based only on firsttup and lasttup -- so we just give up. - */ - if (!start_past_saop_eq && !so->skipScan) - break; /* unsafe to go further */ - - /* - * We have to be even more careful with RowCompares that come - * after an array: we assume it's unsafe to even bypass the array. - * Calling _bt_start_array_keys to recover the scan's arrays - * following use of forcenonrequired mode isn't compatible with - * _bt_check_rowcompare's continuescan=false behavior with NULL - * row compare members. _bt_advance_array_keys must not make a - * decision on the basis of a key not being satisfied in the - * opposite-to-scan direction until the scan reaches a leaf page - * where the same key begins to be satisfied in scan direction. - * The _bt_first !used_all_subkeys behavior makes this limitation - * hard to work around some other way. - */ - return; /* completely unsafe to set pstate.startikey */ + /* RowCompare inequalities currently aren't supported */ + break; /* "unsafe" */ } if (key->sk_strategy != BTEqualStrategyNumber) { @@ -3078,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, Assert(subkey->sk_flags & SK_ROW_MEMBER); + /* When a NULL row member is compared, the row never matches */ + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case + * (except when it's the first row element that has the NULL arg). + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); + subkey--; + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + if (subkey->sk_attno > tupnatts) { /* @@ -3087,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(BTreeTupleIsPivot(tuple)); - cmpresult = 0; - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - continue; + return true; } datum = index_getattr(tuple, @@ -3101,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { + int reqflags; + if (forcenonrequired) { /* treating scan's keys as non-required */ @@ -3111,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - * (_bt_advance_array_keys also relies on this behavior during - * forward scans.) + * is one of the "must match" subset. However, on a forwards + * scan, we must keep going, because we may have initially + * positioned to the start of the index. + * + * All required NULLS FIRST > row members can use NULL tuple + * values to end backwards scans, just like with other values. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a + * backwards scan upon reaching the index's rightmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42. + */ + reqflags = SK_BT_REQBKWD; + + /* + * When a most significant required NULLS FIRST < row compare + * member sees NULL tuple values during a backwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a + * backwards scan upon reaching the rightmost tuple whose "a" + * column has a NULL. The "a" NULL value is "<" 9, and yet + * our < row compare will still end the scan. (This isn't + * safe with later/lower-order row members. Notice that it + * can only happen with an "a" NULL some time after the scan + * completely stops needing to use its "b" and "c" members.) */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQFWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -3129,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - * (_bt_advance_array_keys also relies on this behavior during - * backward scans.) + * one of the "must match" subset. However, on a backward + * scan, we must keep going, because we may have initially + * positioned to the end of the index. + * + * All required NULLS LAST < row members can use NULL tuple + * values to end forwards scans, just like with other values. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a + * forwards scan upon reaching the index's leftmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42. */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + reqflags = SK_BT_REQFWD; + + /* + * When a most significant required NULLS LAST > row compare + * member sees NULL tuple values during a forwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a + * forwards scan upon reaching the leftmost tuple whose "a" + * column has a NULL. The "a" NULL value is ">" 9, and yet + * our > row compare will end the scan. (This isn't safe with + * later/lower-order row members. Notice that it can only + * happen with an "a" NULL some time after the scan completely + * stops needing to use its "b" and "c" members.) + */ + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQBKWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -3148,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return false; } - if (subkey->sk_flags & SK_ISNULL) - { - /* - * Unlike the simple-scankey case, this isn't a disallowed case - * (except when it's the first row element that has the NULL arg). - * But it can never match. If all the earlier row comparison - * columns are required for the scan direction, we can stop the - * scan, because there can't be another tuple that will succeed. - */ - Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); - subkey--; - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - return false; - } - /* Perform the test --- three-way comparison not bool operator */ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, subkey->sk_collation, diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c index 5dd742339969a..35e3af2903ed2 100644 --- a/src/backend/access/rmgrdesc/replorigindesc.c +++ b/src/backend/access/rmgrdesc/replorigindesc.c @@ -29,7 +29,7 @@ replorigin_desc(StringInfo buf, XLogReaderState *record) xlrec = (xl_replorigin_set *) rec; - appendStringInfo(buf, "set %u; lsn %X/%X; force: %d", + appendStringInfo(buf, "set %u; lsn %X/%08X; force: %d", xlrec->node_id, LSN_FORMAT_ARGS(xlrec->remote_lsn), xlrec->force); diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 305598e2865c8..f0f696855b9af 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -359,7 +359,7 @@ xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -384,7 +384,7 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec, RepOriginId or if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -418,7 +418,7 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI * way as PrepareRedoAdd(). */ if (origin_id != InvalidRepOriginId) - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 58040f28656fc..cd6c2a2f650a6 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -65,7 +65,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) { CheckPoint *checkpoint = (CheckPoint *) rec; - appendStringInfo(buf, "redo %X/%X; " + appendStringInfo(buf, "redo %X/%08X; " "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " @@ -111,7 +111,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) XLogRecPtr startpoint; memcpy(&startpoint, rec, sizeof(XLogRecPtr)); - appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint)); + appendStringInfo(buf, "%X/%08X", LSN_FORMAT_ARGS(startpoint)); } else if (info == XLOG_PARAMETER_CHANGE) { @@ -156,7 +156,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xl_overwrite_contrecord xlrec; memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord)); - appendStringInfo(buf, "lsn %X/%X; time %s", + appendStringInfo(buf, "lsn %X/%08X; time %s", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), timestamptz_to_str(xlrec.overwrite_time)); } diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 48f10bec91e12..e80fbe109cf3c 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -110,9 +110,7 @@ static SlruCtlData XactCtlData; #define XactCtl (&XactCtlData) -static int ZeroCLOGPage(int64 pageno, bool writeXlog); static bool CLOGPagePrecedes(int64 page1, int64 page2); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, @@ -832,41 +830,8 @@ check_transaction_buffers(int *newval, void **extra, GucSource source) void BootStrapCLOG(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(XactCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the commit log */ - slotno = ZeroCLOGPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of CLOG to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCLOGPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(XactCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(XactCtl, 0); } /* @@ -974,8 +939,9 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(XactCtl, pageno); + XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -1067,17 +1033,6 @@ CLOGPagePrecedes(int64 page1, int64 page2) } -/* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); -} - /* * Write a TRUNCATE xlog record * @@ -1114,19 +1069,9 @@ clog_redo(XLogReaderState *record) if (info == CLOG_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(XactCtl, pageno); } else if (info == CLOG_TRUNCATE) { diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 113fae1437ad8..370b38e048b91 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -114,11 +114,9 @@ static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno); static void error_commit_ts_disabled(void); -static int ZeroCommitTsPage(int64 pageno, bool writeXlog); static bool CommitTsPagePrecedes(int64 page1, int64 page2); static void ActivateCommitTs(void); static void DeactivateCommitTs(void); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid); /* @@ -602,28 +600,6 @@ BootStrapCommitTs(void) */ } -/* - * Initialize (or reinitialize) a page of CommitTs to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCommitTsPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(CommitTsCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; -} - /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized TransamVariables->nextXid. @@ -707,6 +683,13 @@ ActivateCommitTs(void) TransactionId xid; int64 pageno; + /* + * During bootstrap, we should not register commit timestamps so skip the + * activation in this case. + */ + if (IsBootstrapProcessingMode()) + return; + /* If we've done this already, there's nothing to do */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (commitTsShared->commitTsActive) @@ -747,16 +730,7 @@ ActivateCommitTs(void) /* Create the current segment file, if necessary */ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) - { - LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - int slotno; - - LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); - } + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); /* Change the activation status in shared memory. */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -867,8 +841,12 @@ ExtendCommitTs(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCommitTsPage(pageno, !InRecovery); + /* Zero the page ... */ + SimpleLruZeroPage(CommitTsCtl, pageno); + + /* and make a WAL entry about that, unless we're in REDO */ + if (!InRecovery) + XLogSimpleInsertInt64(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -982,17 +960,6 @@ CommitTsPagePrecedes(int64 page1, int64 page2) } -/* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); -} - /* * Write a TRUNCATE xlog record */ @@ -1023,19 +990,9 @@ commit_ts_redo(XLogReaderState *record) if (info == COMMIT_TS_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); } else if (info == COMMIT_TS_TRUNCATE) { diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3c06ac45532f8..3cb09c3d5987c 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -401,8 +401,6 @@ static void mXactCachePut(MultiXactId multi, int nmembers, static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog); -static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, @@ -413,7 +411,6 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); -static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, @@ -1847,7 +1844,7 @@ AtPrepare_MultiXact(void) * Clean up after successful PREPARE TRANSACTION */ void -PostPrepare_MultiXact(TransactionId xid) +PostPrepare_MultiXact(FullTransactionId fxid) { MultiXactId myOldestMember; @@ -1858,7 +1855,7 @@ PostPrepare_MultiXact(TransactionId xid) myOldestMember = OldestMemberMXactId[MyProcNumber]; if (MultiXactIdIsValid(myOldestMember)) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); /* * Even though storing MultiXactId is atomic, acquire lock to make @@ -1896,10 +1893,10 @@ PostPrepare_MultiXact(TransactionId xid) * Recover the state of a prepared transaction at startup */ void -multixact_twophase_recover(TransactionId xid, uint16 info, +multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); MultiXactId oldestMember; /* @@ -1917,10 +1914,10 @@ multixact_twophase_recover(TransactionId xid, uint16 info, * Similar to AtEOXact_MultiXact but for COMMIT PREPARED */ void -multixact_twophase_postcommit(TransactionId xid, uint16 info, +multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true); Assert(len == sizeof(MultiXactId)); @@ -1932,10 +1929,10 @@ multixact_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. */ void -multixact_twophase_postabort(TransactionId xid, uint16 info, +multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - multixact_twophase_postcommit(xid, info, recdata, len); + multixact_twophase_postcommit(fxid, info, recdata, len); } /* @@ -2033,70 +2030,9 @@ check_multixact_member_buffers(int *newval, void **extra, GucSource source) void BootStrapMultiXact(void) { - int slotno; - LWLock *lock; - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the members log */ - slotno = ZeroMultiXactMemberPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); - - return slotno; -} - -/* - * Ditto, for MultiXactMember - */ -static int -ZeroMultiXactMemberPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); - - return slotno; + /* Zero the initial pages and flush them to disk */ + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } /* @@ -2134,7 +2070,7 @@ MaybeExtendOffsetSlru(void) * with creating a new segment file even if the page we're writing is * not the first in it, so this is enough. */ - slotno = ZeroMultiXactOffsetPage(pageno, false); + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); SimpleLruWritePage(MultiXactOffsetCtl, slotno); } @@ -2568,8 +2504,10 @@ ExtendMultiXactOffset(MultiXactId multi) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, + pageno); LWLockRelease(lock); } @@ -2611,8 +2549,10 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactMemberPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactMemberCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, + XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno); LWLockRelease(lock); } @@ -3347,18 +3287,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) return (diff < 0); } -/* - * Write an xlog record reflecting the zeroing of either a MEMBERs or - * OFFSETs page (info shows which) - */ -static void -WriteMZeroPageXlogRec(int64 pageno, uint8 info) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_MULTIXACT_ID, info); -} - /* * Write a TRUNCATE xlog record * @@ -3401,36 +3329,16 @@ multixact_redo(XLogReaderState *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno); } else if (info == XLOG_MULTIXACT_CREATE_ID) { diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe56286d9a972..10ec259f38295 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -433,6 +433,31 @@ SimpleLruZeroLSNs(SlruCtl ctl, int slotno) shared->lsn_groups_per_page * sizeof(XLogRecPtr)); } +/* + * This is a convenience wrapper for the common case of zeroing a page and + * immediately flushing it to disk. + * + * Control lock is acquired and released here. + */ +void +SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno) +{ + int slotno; + LWLock *lock; + + lock = SimpleLruGetBankLock(ctl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(ctl, pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(ctl, slotno); + Assert(!ctl->shared->page_dirty[slotno]); + + LWLockRelease(lock); +} + /* * Wait for any active I/O on a page slot to finish. (This does not * guarantee that new I/O hasn't been started before we return, though. diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 15153618fad16..09aace9e09f0e 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -74,7 +74,6 @@ static SlruCtlData SubTransCtlData; #define SubTransCtl (&SubTransCtlData) -static int ZeroSUBTRANSPage(int64 pageno); static bool SubTransPagePrecedes(int64 page1, int64 page2); @@ -269,33 +268,8 @@ check_subtrans_buffers(int *newval, void **extra, GucSource source) void BootStrapSUBTRANS(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(SubTransCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); - - /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of SUBTRANS to zeroes. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroSUBTRANSPage(int64 pageno) -{ - return SimpleLruZeroPage(SubTransCtl, pageno); + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(SubTransCtl, 0); } /* @@ -335,7 +309,7 @@ StartupSUBTRANS(TransactionId oldestActiveXID) prevlock = lock; } - (void) ZeroSUBTRANSPage(startPage); + (void) SimpleLruZeroPage(SubTransCtl, startPage); if (startPage == endPage) break; @@ -395,7 +369,7 @@ ExtendSUBTRANS(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page */ - ZeroSUBTRANSPage(pageno); + SimpleLruZeroPage(SubTransCtl, pageno); LWLockRelease(lock); } diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index a27f27cc037d1..186eb91f60943 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -154,7 +154,7 @@ readTimeLineHistory(TimeLineID targetTLI) if (*ptr == '\0' || *ptr == '#') continue; - nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo); if (nfields < 1) { @@ -399,7 +399,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, * parent file failed to end with one. */ snprintf(buffer, sizeof(buffer), - "%s%u\t%X/%X\t%s\n", + "%s%u\t%X/%08X\t%s\n", (srcfd < 0) ? "" : "\n", parentTLI, LSN_FORMAT_ARGS(switchpoint), diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 73a80559194e7..7918176fc588e 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -159,7 +159,7 @@ typedef struct GlobalTransactionData */ XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ - TransactionId xid; /* The GXACT id */ + FullTransactionId fxid; /* The GXACT full xid */ Oid owner; /* ID of user that executed the xact */ ProcNumber locking_backend; /* backend currently working on the xact */ @@ -197,6 +197,7 @@ static GlobalTransaction MyLockedGxact = NULL; static bool twophaseExitRegistered = false; +static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning); static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, @@ -216,19 +217,19 @@ static void RecordTransactionAbortPrepared(TransactionId xid, int nstats, xl_xact_stats_item *stats, const char *gid); -static void ProcessRecords(char *bufptr, TransactionId xid, +static void ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]); static void RemoveGXact(GlobalTransaction gxact); static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); -static char *ProcessTwoPhaseBuffer(TransactionId xid, +static char *ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid); -static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, +static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid); -static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); -static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); +static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning); +static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len); /* * Initialization of shared memory @@ -356,7 +357,7 @@ PostPrepare_Twophase(void) * Reserve the GID for the given transaction. */ GlobalTransaction -MarkAsPreparing(TransactionId xid, const char *gid, +MarkAsPreparing(FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; @@ -407,7 +408,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact->next; - MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid); gxact->ondisk = false; @@ -430,11 +431,13 @@ MarkAsPreparing(TransactionId xid, const char *gid, * Note: This function should be called with appropriate locks held. */ static void -MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, - TimestampTz prepared_at, Oid owner, Oid databaseid) +MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid) { PGPROC *proc; int i; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); @@ -479,7 +482,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->subxidStatus.count = 0; gxact->prepared_at = prepared_at; - gxact->xid = xid; + gxact->fxid = fxid; gxact->owner = owner; gxact->locking_backend = MyProcNumber; gxact->valid = false; @@ -797,12 +800,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS) * caller had better hold it. */ static GlobalTransaction -TwoPhaseGetGXact(TransactionId xid, bool lock_held) +TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held) { GlobalTransaction result = NULL; int i; - static TransactionId cached_xid = InvalidTransactionId; + static FullTransactionId cached_fxid = {InvalidTransactionId}; static GlobalTransaction cached_gxact = NULL; Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); @@ -811,7 +814,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called * repeatedly for the same XID. We can save work with a simple cache. */ - if (xid == cached_xid) + if (FullTransactionIdEquals(fxid, cached_fxid)) return cached_gxact; if (!lock_held) @@ -821,7 +824,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) { GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { result = gxact; break; @@ -832,9 +835,10 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) LWLockRelease(TwoPhaseStateLock); if (result == NULL) /* should not happen */ - elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + elog(ERROR, "failed to find GlobalTransaction for xid %u", + XidFromFullTransactionId(fxid)); - cached_xid = xid; + cached_fxid = fxid; cached_gxact = result; return result; @@ -881,7 +885,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, *have_more = true; break; } - result = gxact->xid; + result = XidFromFullTransactionId(gxact->fxid); } } @@ -892,7 +896,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, /* * TwoPhaseGetDummyProcNumber - * Get the dummy proc number for prepared transaction specified by XID + * Get the dummy proc number for prepared transaction * * Dummy proc numbers are similar to proc numbers of real backends. They * start at MaxBackends, and are unique across all currently active real @@ -900,24 +904,24 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, * TwoPhaseStateLock will not be taken, so the caller had better hold it. */ ProcNumber -TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return gxact->pgprocno; } /* * TwoPhaseGetDummyProc - * Get the PGPROC that represents a prepared transaction specified by XID + * Get the PGPROC that represents a prepared transaction * * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the * caller had better hold it. */ PGPROC * -TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return GetPGProcByNumber(gxact->pgprocno); } @@ -942,10 +946,8 @@ AdjustToFullTransactionId(TransactionId xid) } static inline int -TwoPhaseFilePath(char *path, TransactionId xid) +TwoPhaseFilePath(char *path, FullTransactionId fxid) { - FullTransactionId fxid = AdjustToFullTransactionId(xid); - return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X", EpochFromFullTransactionId(fxid), XidFromFullTransactionId(fxid)); @@ -1049,7 +1051,7 @@ void StartPrepare(GlobalTransaction gxact) { PGPROC *proc = GetPGProcByNumber(gxact->pgprocno); - TransactionId xid = gxact->xid; + TransactionId xid = XidFromFullTransactionId(gxact->fxid); TwoPhaseFileHeader hdr; TransactionId *children; RelFileLocator *commitrels; @@ -1181,7 +1183,11 @@ EndPrepare(GlobalTransaction gxact) * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes + * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in + * the critical commit section. We need to know about such transactions + * for conflict detection in logical replication. See + * GetOldestActiveTransactionId(true, false) and its use. * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. @@ -1281,10 +1287,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, * If it looks OK (has a valid magic number and CRC), return the palloc'd * contents of the file, issuing an error when finding corrupted data. If * missing_ok is true, which indicates that missing files can be safely - * ignored, then return NULL. This state can be reached when doing recovery. + * ignored, then return NULL. This state can be reached when doing recovery + * after discarding two-phase files from frozen epochs. */ static char * -ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok) { char path[MAXPGPATH]; char *buf; @@ -1296,7 +1303,7 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok) file_crc; int r; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) @@ -1426,12 +1433,12 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X: %s", + errmsg("could not read two-phase state from WAL at %X/%08X: %s", LSN_FORMAT_ARGS(lsn), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X", + errmsg("could not read two-phase state from WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); } @@ -1439,7 +1446,7 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) ereport(ERROR, (errcode_for_file_access(), - errmsg("expected two-phase state data is not present in WAL at %X/%X", + errmsg("expected two-phase state data is not present in WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); if (len != NULL) @@ -1461,6 +1468,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid) char *buf; TwoPhaseFileHeader *hdr; bool result; + FullTransactionId fxid; Assert(TransactionIdIsValid(xid)); @@ -1468,7 +1476,8 @@ StandbyTransactionIdIsPrepared(TransactionId xid) return false; /* nothing to do */ /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); + fxid = AdjustToFullTransactionId(xid); + buf = ReadTwoPhaseFile(fxid, true); if (buf == NULL) return false; @@ -1488,6 +1497,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { GlobalTransaction gxact; PGPROC *proc; + FullTransactionId fxid; TransactionId xid; bool ondisk; char *buf; @@ -1509,7 +1519,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) */ gxact = LockGXact(gid, GetUserId()); proc = GetPGProcByNumber(gxact->pgprocno); - xid = gxact->xid; + fxid = gxact->fxid; + xid = XidFromFullTransactionId(fxid); /* * Read and validate 2PC state data. State data will typically be stored @@ -1517,7 +1528,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * to disk if for some reason they have lived for a long time. */ if (gxact->ondisk) - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); else XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); @@ -1636,11 +1647,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit) /* And now do the callbacks */ if (isCommit) - ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + ProcessRecords(bufptr, fxid, twophase_postcommit_callbacks); else - ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + ProcessRecords(bufptr, fxid, twophase_postabort_callbacks); - PredicateLockTwoPhaseFinish(xid, isCommit); + PredicateLockTwoPhaseFinish(fxid, isCommit); /* * Read this value while holding the two-phase lock, as the on-disk 2PC @@ -1664,7 +1675,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * And now we can clean up any files we may have left. */ if (ondisk) - RemoveTwoPhaseFile(xid, true); + RemoveTwoPhaseFile(fxid, true); MyLockedGxact = NULL; @@ -1677,7 +1688,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. */ static void -ProcessRecords(char *bufptr, TransactionId xid, +ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]) { for (;;) @@ -1691,24 +1702,28 @@ ProcessRecords(char *bufptr, TransactionId xid, bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); if (callbacks[record->rmid] != NULL) - callbacks[record->rmid] (xid, record->info, bufptr, record->len); + callbacks[record->rmid] (fxid, record->info, bufptr, record->len); bufptr += MAXALIGN(record->len); } } /* - * Remove the 2PC file for the specified XID. + * Remove the 2PC file. * * If giveWarning is false, do not complain about file-not-present; * this is an expected case during WAL replay. + * + * This routine is used at early stages at recovery where future and + * past orphaned files are checked, hence the FullTransactionId to build + * a complete file name fit for the removal. */ static void -RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning) { char path[MAXPGPATH]; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); if (unlink(path)) if (errno != ENOENT || giveWarning) ereport(WARNING, @@ -1723,7 +1738,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) * Note: content and len don't include CRC. */ static void -RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len) { char path[MAXPGPATH]; pg_crc32c statefile_crc; @@ -1734,7 +1749,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) COMP_CRC32C(statefile_crc, content, len); FIN_CRC32C(statefile_crc); - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); @@ -1846,7 +1861,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int len; XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); - RecreateTwoPhaseFile(gxact->xid, buf, len); + RecreateTwoPhaseFile(gxact->fxid, buf, len); gxact->ondisk = true; gxact->prepare_start_lsn = InvalidXLogRecPtr; gxact->prepare_end_lsn = InvalidXLogRecPtr; @@ -1897,19 +1912,17 @@ restoreTwoPhaseData(void) if (strlen(clde->d_name) == 16 && strspn(clde->d_name, "0123456789ABCDEF") == 16) { - TransactionId xid; FullTransactionId fxid; char *buf; fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16)); - xid = XidFromFullTransactionId(fxid); - buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr, true, false, false); if (buf == NULL) continue; - PrepareRedoAdd(buf, InvalidXLogRecPtr, + PrepareRedoAdd(fxid, buf, InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidRepOriginId); } } @@ -1968,9 +1981,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, false, true); @@ -1981,6 +1992,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) * OK, we think this file is valid. Incorporate xid into the * running-minimum result. */ + xid = XidFromFullTransactionId(gxact->fxid); if (TransactionIdPrecedes(xid, result)) result = xid; @@ -2036,15 +2048,12 @@ StandbyRecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf != NULL) @@ -2077,16 +2086,14 @@ RecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + FullTransactionId fxid = gxact->fxid; char *bufptr; TwoPhaseFileHeader *hdr; TransactionId *subxids; const char *gid; - xid = gxact->xid; - /* * Reconstruct subtrans state for the transaction --- needed because * pg_subtrans is not preserved over a restart. Note that we are @@ -2096,17 +2103,20 @@ RecoverPreparedTransactions(void) * SubTransSetParent has been set before, if the prepared transaction * generated xid assignment records. */ - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf == NULL) continue; ereport(LOG, - (errmsg("recovering prepared transaction %u from shared memory", xid))); + (errmsg("recovering prepared transaction %u of epoch %u from shared memory", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)))); hdr = (TwoPhaseFileHeader *) buf; - Assert(TransactionIdEquals(hdr->xid, xid)); + Assert(TransactionIdEquals(hdr->xid, + XidFromFullTransactionId(gxact->fxid))); bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; bufptr += MAXALIGN(hdr->gidlen); @@ -2122,7 +2132,7 @@ RecoverPreparedTransactions(void) * Recreate its GXACT and dummy PGPROC. But, check whether it was * added in redo and already has a shmem entry for it. */ - MarkAsPreparingGuts(gxact, xid, gid, + MarkAsPreparingGuts(gxact, gxact->fxid, gid, hdr->prepared_at, hdr->owner, hdr->database); @@ -2137,7 +2147,7 @@ RecoverPreparedTransactions(void) /* * Recover other state (notably locks) using resource managers. */ - ProcessRecords(bufptr, xid, twophase_recover_callbacks); + ProcessRecords(bufptr, fxid, twophase_recover_callbacks); /* * Release locks held by the standby process after we process each @@ -2145,7 +2155,7 @@ RecoverPreparedTransactions(void) * additional locks at any one time. */ if (InHotStandby) - StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids); /* * We're done with recovering this transaction. Clear MyLockedGxact, @@ -2164,7 +2174,7 @@ RecoverPreparedTransactions(void) /* * ProcessTwoPhaseBuffer * - * Given a transaction id, read it either from disk or read it directly + * Given a FullTransactionId, read it either from disk or read it directly * via shmem xlog record pointer using the provided "prepare_start_lsn". * * If setParent is true, set up subtransaction parent linkages. @@ -2173,13 +2183,12 @@ RecoverPreparedTransactions(void) * value scanned. */ static char * -ProcessTwoPhaseBuffer(TransactionId xid, +ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid) { FullTransactionId nextXid = TransamVariables->nextXid; - TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId *subxids; char *buf; TwoPhaseFileHeader *hdr; @@ -2191,41 +2200,46 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(prepare_start_lsn != InvalidXLogRecPtr); /* Already processed? */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + if (TransactionIdDidCommit(XidFromFullTransactionId(fxid)) || + TransactionIdDidAbort(XidFromFullTransactionId(fxid))) { if (fromdisk) { ereport(WARNING, - (errmsg("removing stale two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing stale two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing stale two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing stale two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } /* Reject XID if too new */ - if (TransactionIdFollowsOrEquals(xid, origNextXid)) + if (FullTransactionIdFollowsOrEquals(fxid, nextXid)) { if (fromdisk) { ereport(WARNING, - (errmsg("removing future two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing future two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing future two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing future two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } @@ -2233,7 +2247,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, if (fromdisk) { /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); } else { @@ -2243,18 +2257,20 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; - if (!TransactionIdEquals(hdr->xid, xid)) + if (!TransactionIdEquals(hdr->xid, XidFromFullTransactionId(fxid))) { if (fromdisk) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state file for transaction %u", - xid))); + errmsg("corrupted two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state in memory for transaction %u", - xid))); + errmsg("corrupted two-phase state in memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); } /* @@ -2268,14 +2284,14 @@ ProcessTwoPhaseBuffer(TransactionId xid, { TransactionId subxid = subxids[i]; - Assert(TransactionIdFollows(subxid, xid)); + Assert(TransactionIdFollows(subxid, XidFromFullTransactionId(fxid))); /* update nextXid if needed */ if (setNextXid) AdvanceNextFullTransactionIdPastXid(subxid); if (setParent) - SubTransSetParent(subxid, xid); + SubTransSetParent(subxid, XidFromFullTransactionId(fxid)); } return buf; @@ -2286,7 +2302,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit (q.v. if you change - * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a * race condition. * * We know the transaction made at least one XLOG entry (its PREPARE), @@ -2306,7 +2322,7 @@ RecordTransactionCommitPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; - TimestampTz committs = GetCurrentTimestamp(); + TimestampTz committs; bool replorigin; /* @@ -2319,8 +2335,24 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before + * commit time is written. + */ + pg_write_barrier(); + + /* + * Note it is important to set committs value after marking ourselves as + * in the commit critical section (DELAY_CHKPT_IN_COMMIT). This is because + * we want to ensure all transactions that have acquired commit timestamp + * are finished before we allow the logical replication client to advance + * its xid which is used to hold back dead rows for conflict detection. + * See comments atop worker.c. + */ + committs = GetCurrentTimestamp(); /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2369,7 +2401,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); @@ -2466,8 +2498,9 @@ RecordTransactionAbortPrepared(TransactionId xid, * data, the entry is marked as located on disk. */ void -PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, - XLogRecPtr end_lsn, RepOriginId origin_id) +PrepareRedoAdd(FullTransactionId fxid, char *buf, + XLogRecPtr start_lsn, XLogRecPtr end_lsn, + RepOriginId origin_id) { TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; char *bufptr; @@ -2477,6 +2510,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(RecoveryInProgress()); + if (!FullTransactionIdIsValid(fxid)) + { + Assert(InRecovery); + fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid, + hdr->xid); + } + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; @@ -2505,14 +2545,15 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, { char path[MAXPGPATH]; - TwoPhaseFilePath(path, hdr->xid); + Assert(InRecovery); + TwoPhaseFilePath(path, fxid); if (access(path, F_OK) == 0) { ereport(reachedConsistency ? ERROR : WARNING, (errmsg("could not recover two-phase state file for transaction %u", hdr->xid), - errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.", LSN_FORMAT_ARGS(start_lsn)))); return; } @@ -2536,7 +2577,7 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, gxact->prepared_at = hdr->prepared_at; gxact->prepare_start_lsn = start_lsn; gxact->prepare_end_lsn = end_lsn; - gxact->xid = hdr->xid; + gxact->fxid = fxid; gxact->owner = hdr->owner; gxact->locking_backend = INVALID_PROC_NUMBER; gxact->valid = false; @@ -2555,11 +2596,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, false /* backward */ , false /* WAL */ ); } - elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); + elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)); } /* - * PrepareRedoRemove + * PrepareRedoRemoveFull * * Remove the corresponding gxact entry from TwoPhaseState. Also remove * the 2PC file if a prepared transaction was saved via an earlier checkpoint. @@ -2567,8 +2610,8 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState * is updated. */ -void -PrepareRedoRemove(TransactionId xid, bool giveWarning) +static void +PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning) { GlobalTransaction gxact = NULL; int i; @@ -2581,7 +2624,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) { gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { Assert(gxact->inredo); found = true; @@ -2598,12 +2641,28 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) /* * And now we can clean up any files we may have left. */ - elog(DEBUG2, "removing 2PC data for transaction %u", xid); + elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u ", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)); + if (gxact->ondisk) - RemoveTwoPhaseFile(xid, giveWarning); + RemoveTwoPhaseFile(fxid, giveWarning); + RemoveGXact(gxact); } +/* + * Wrapper of PrepareRedoRemoveFull(), for TransactionIds. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + FullTransactionId fxid = + FullTransactionIdFromAllowableAt(TransamVariables->nextXid, xid); + + PrepareRedoRemoveFull(fxid, giveWarning); +} + /* * LookupGXact * Check if the prepared transaction with the given GID, lsn and timestamp @@ -2648,7 +2707,7 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, * between publisher and subscriber. */ if (gxact->ondisk) - buf = ReadTwoPhaseFile(gxact->xid, false); + buf = ReadTwoPhaseFile(gxact->fxid, false); else { Assert(gxact->prepare_start_lsn); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b885513f76541..b46e7e9c2a6b0 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1431,10 +1431,22 @@ RecordTransactionCommit(void) * without holding the ProcArrayLock, since we're the only one * modifying it. This makes checkpoint's determination of which xacts * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + * + * Note, it is important to get the commit timestamp after marking the + * transaction in the commit critical section. See + * RecordTransactionCommitPrepared. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + Assert(xactStopTimestamp == 0); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible + * before commit time is written. + */ + pg_write_barrier(); /* * Insert the commit XLOG record. @@ -1537,7 +1549,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); } @@ -2515,7 +2527,7 @@ static void PrepareTransaction(void) { TransactionState s = CurrentTransactionState; - TransactionId xid = GetCurrentTransactionId(); + FullTransactionId fxid = GetCurrentFullTransactionId(); GlobalTransaction gxact; TimestampTz prepared_at; @@ -2644,7 +2656,7 @@ PrepareTransaction(void) * Reserve the GID for this transaction. This could fail if the requested * GID is invalid or already in use. */ - gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + gxact = MarkAsPreparing(fxid, prepareGID, prepared_at, GetUserId(), MyDatabaseId); prepareGID = NULL; @@ -2694,7 +2706,7 @@ PrepareTransaction(void) * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would * conclude "xact already committed or aborted" for our locks. */ - PostPrepare_Locks(xid); + PostPrepare_Locks(fxid); /* * Let others know about no transaction in progress by me. This has to be @@ -2738,9 +2750,9 @@ PrepareTransaction(void) PostPrepare_smgr(); - PostPrepare_MultiXact(xid); + PostPrepare_MultiXact(fxid); - PostPrepare_PredicateLocks(xid); + PostPrepare_PredicateLocks(fxid); ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, @@ -6420,7 +6432,8 @@ xact_redo(XLogReaderState *record) * gxact entry. */ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); - PrepareRedoAdd(XLogRecGetData(record), + PrepareRedoAdd(InvalidFullTransactionId, + XLogRecGetData(record), record->ReadRecPtr, record->EndRecPtr, XLogRecGetOrigin(record)); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 47ffc0a230772..b0891998b243f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -96,6 +96,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -449,7 +450,6 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ @@ -1028,7 +1028,7 @@ XLogInsertRecord(XLogRecData *rdata, oldCxt = MemoryContextSwitchTo(walDebugCxt); initStringInfo(&buf); - appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos)); /* * We have to piece together the WAL record data from the XLogRecData @@ -1092,6 +1092,9 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -1549,8 +1552,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) if (upto > reservedUpto) { ereport(LOG, - (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", - LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))); upto = reservedUpto; } @@ -1716,7 +1719,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); if (expectedEndPtr != endptr) - elog(PANIC, "could not find WAL buffer for %X/%X", + elog(PANIC, "could not find WAL buffer for %X/%08X", LSN_FORMAT_ARGS(ptr)); } else @@ -1776,7 +1779,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult); if (startptr + count > inserted) ereport(ERROR, - errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X", + errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X", LSN_FORMAT_ARGS(startptr + count), LSN_FORMAT_ARGS(inserted))); @@ -2109,6 +2112,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. + */ + pgstat_report_fixed = true; } } } @@ -2281,7 +2290,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) #ifdef WAL_DEBUG if (XLOG_DEBUG && npages > 0) { - elog(DEBUG1, "initialized %d pages, up to %X/%X", + elog(DEBUG1, "initialized %d pages, up to %X/%08X", npages, LSN_FORMAT_ARGS(NewPageEndPtr)); } #endif @@ -2346,25 +2355,6 @@ check_wal_segment_size(int *newval, void **extra, GucSource source) return true; } -/* - * GUC check_hook for max_slot_wal_keep_size - * - * We don't allow the value of max_slot_wal_keep_size other than -1 during the - * binary upgrade. See start_postmaster() in pg_upgrade for more details. - */ -bool -check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != -1) - { - GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.", - "max_slot_wal_keep_size"); - return false; - } - - return true; -} - /* * At a checkpoint, how many WAL segments to recycle as preallocated future * XLOG segments? Returns the highest segment that should be preallocated. @@ -2492,7 +2482,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) XLogRecPtr EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]); if (LogwrtResult.Write >= EndPtr) - elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X", LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(EndPtr)); @@ -2892,7 +2882,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); if (!force && newMinRecoveryPoint < lsn) elog(WARNING, - "xlog min recovery request %X/%X is past current point %X/%X", + "xlog min recovery request %X/%08X is past current point %X/%08X", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); /* update control file */ @@ -2905,9 +2895,9 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; ereport(DEBUG2, - (errmsg_internal("updated min recovery point to %X/%X on timeline %u", - LSN_FORMAT_ARGS(newMinRecoveryPoint), - newMinRecoveryPointTLI))); + errmsg_internal("updated min recovery point to %X/%08X on timeline %u", + LSN_FORMAT_ARGS(newMinRecoveryPoint), + newMinRecoveryPointTLI)); } } LWLockRelease(ControlFileLock); @@ -2945,7 +2935,7 @@ XLogFlush(XLogRecPtr record) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(LogwrtResult.Flush)); @@ -3078,7 +3068,7 @@ XLogFlush(XLogRecPtr record) */ if (LogwrtResult.Flush < record) elog(ERROR, - "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Flush)); } @@ -3205,7 +3195,7 @@ XLogBackgroundFlush(void) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(WriteRqst.Write), LSN_FORMAT_ARGS(WriteRqst.Flush), LSN_FORMAT_ARGS(LogwrtResult.Write), @@ -5763,7 +5753,6 @@ StartupXLOG(void) SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); - XLogCtl->ckptFullXid = checkPoint.nextXid; /* * Clear out any old relcache cache files. This is *necessary* if we do @@ -6505,7 +6494,7 @@ PerformRecoveryXLogAction(void) else { RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_IMMEDIATE | + CHECKPOINT_FAST | CHECKPOINT_WAIT); } @@ -6814,7 +6803,7 @@ ShutdownXLOG(int code, Datum arg) WalSndWaitStopping(); if (RecoveryInProgress()) - CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); else { /* @@ -6826,7 +6815,7 @@ ShutdownXLOG(int code, Datum arg) if (XLogArchivingActive()) RequestXLogSwitch(false); - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); } } @@ -6842,24 +6831,24 @@ LogCheckpointStart(int flags, bool restartpoint) (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); else ereport(LOG, /* translator: the placeholders show checkpoint options */ (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); } /* @@ -6921,7 +6910,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -6945,7 +6934,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -7042,12 +7031,12 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, - * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring + * checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). - * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables. * * Note: flags contains other bits, of interest here only for logging purposes. * In particular note that this routine is synchronous and does not pay @@ -7142,7 +7131,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. */ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true); else checkPoint.oldestActiveXid = InvalidTransactionId; @@ -7456,11 +7445,6 @@ CreateCheckPoint(int flags) UpdateControlFile(); LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We are now done with critical updates; no need for system panic if we * have trouble while fooling with old log segments. @@ -7641,7 +7625,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, if (!RecoveryInProgress()) elog(ERROR, "can only be used at end of recovery"); if (pagePtr % XLOG_BLCKSZ != 0) - elog(ERROR, "invalid position for missing continuation record %X/%X", + elog(ERROR, "invalid position for missing continuation record %X/%08X", LSN_FORMAT_ARGS(pagePtr)); /* The current WAL insert position should be right after the page header */ @@ -7652,7 +7636,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, startPos += SizeOfXLogShortPHD; recptr = GetXLogInsertRecPtr(); if (recptr != startPos) - elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD", + elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD", LSN_FORMAT_ARGS(recptr)); START_CRIT_SECTION(); @@ -7682,7 +7666,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, /* check that the record was inserted to the right place */ if (ProcLastRecPtr != startPos) - elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X", + elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X", LSN_FORMAT_ARGS(ProcLastRecPtr)); XLogFlush(recptr); @@ -7751,8 +7735,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record) if (XLogHaveInvalidPages()) { elog(DEBUG2, - "could not record restart point at %X/%X because there " - "are unresolved references to invalid pages", + "could not record restart point at %X/%08X because there are unresolved references to invalid pages", LSN_FORMAT_ARGS(checkPoint->redo)); return; } @@ -7832,8 +7815,8 @@ CreateRestartPoint(int flags) lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) { ereport(DEBUG2, - (errmsg_internal("skipping restartpoint, already performed at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + errmsg_internal("skipping restartpoint, already performed at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo))); UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); if (flags & CHECKPOINT_IS_SHUTDOWN) @@ -8017,10 +8000,10 @@ CreateRestartPoint(int flags) xtime = GetLatestXTime(); ereport((log_checkpoints ? LOG : DEBUG2), - (errmsg("recovery restart point at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)), - xtime ? errdetail("Last completed transaction was at log time %s.", - timestamptz_to_str(xtime)) : 0)); + errmsg("recovery restart point at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0); /* * Finally, execute archive_cleanup_command, if any. @@ -8151,17 +8134,19 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) XLByteToSeg(recptr, currSegNo, wal_segment_size); segno = currSegNo; - /* - * Calculate how many segments are kept by slots first, adjusting for - * max_slot_wal_keep_size. - */ + /* Calculate how many segments are kept by slots. */ keep = XLogGetReplicationSlotMinimumLSN(); if (keep != InvalidXLogRecPtr && keep < recptr) { XLByteToSeg(keep, segno, wal_segment_size); - /* Cap by max_slot_wal_keep_size ... */ - if (max_slot_wal_keep_size_mb >= 0) + /* + * Account for max_slot_wal_keep_size to avoid keeping more than + * configured. However, don't do that during a binary upgrade: if + * slots were to be invalidated because of this, it would not be + * possible to preserve logical ones during the upgrade. + */ + if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade) { uint64 slot_keep_segs; @@ -8281,8 +8266,8 @@ XLogRestorePoint(const char *rpName) RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); ereport(LOG, - (errmsg("restore point \"%s\" created at %X/%X", - rpName, LSN_FORMAT_ARGS(RecPtr)))); + errmsg("restore point \"%s\" created at %X/%08X", + rpName, LSN_FORMAT_ARGS(RecPtr))); return RecPtr; } @@ -8534,11 +8519,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We should've already switched to the new TLI before replaying this * record. @@ -8595,11 +8575,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* TLI should not change in an on-line checkpoint */ (void) GetCurrentReplayRecPtr(&replayTLI); if (checkPoint.ThisTimeLineID != replayTLI) @@ -8947,9 +8922,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) * backup state and tablespace map. * * Input parameters are "state" (the backup state), "fast" (if true, we do - * the checkpoint in immediate mode to make it faster), and "tablespaces" - * (if non-NULL, indicates a list of tablespaceinfo structs describing the - * cluster's tablespaces.). + * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a + * list of tablespaceinfo structs describing the cluster's tablespaces.). * * The tablespace map contents are appended to passed-in parameter * tablespace_map and the caller is responsible for including it in the backup @@ -9077,11 +9051,11 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, * during recovery means that checkpointer is running, we can use * RequestCheckpoint() to establish a restartpoint. * - * We use CHECKPOINT_IMMEDIATE only if requested by user (via - * passing fast = true). Otherwise this can take awhile. + * We use CHECKPOINT_FAST only if requested by user (via passing + * fast = true). Otherwise this can take awhile. */ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | - (fast ? CHECKPOINT_IMMEDIATE : 0)); + (fast ? CHECKPOINT_FAST : 0)); /* * Now we need to fetch the checkpoint record location, and also diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 342590e0a46d3..cda4b38b7d67d 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -42,7 +42,7 @@ build_backup_content(BackupState *state, bool ishistoryfile) XLByteToSeg(state->startpoint, startsegno, wal_segment_size); XLogFileName(startxlogfile, state->starttli, startsegno, wal_segment_size); - appendStringInfo(result, "START WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(result, "START WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->startpoint), startxlogfile); if (ishistoryfile) @@ -52,11 +52,11 @@ build_backup_content(BackupState *state, bool ishistoryfile) XLByteToSeg(state->stoppoint, stopsegno, wal_segment_size); XLogFileName(stopxlogfile, state->stoptli, stopsegno, wal_segment_size); - appendStringInfo(result, "STOP WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(result, "STOP WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->stoppoint), stopxlogfile); } - appendStringInfo(result, "CHECKPOINT LOCATION: %X/%X\n", + appendStringInfo(result, "CHECKPOINT LOCATION: %X/%08X\n", LSN_FORMAT_ARGS(state->checkpointloc)); appendStringInfoString(result, "BACKUP METHOD: streamed\n"); appendStringInfo(result, "BACKUP FROM: %s\n", @@ -81,7 +81,7 @@ build_backup_content(BackupState *state, bool ishistoryfile) Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); if (!XLogRecPtrIsInvalid(state->istartpoint)) { - appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%08X\n", LSN_FORMAT_ARGS(state->istartpoint)); appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", state->istarttli); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028eae..c7571429e8e97 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -529,6 +529,18 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } +/* + * Simple wrapper to XLogInsert to insert a WAL record with elementary + * contents (only an int64 is supported as value currently). + */ +XLogRecPtr +XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value) +{ + XLogBeginInsert(); + XLogRegisterData(&value, sizeof(value)); + return XLogInsert(rmid, info); +} + /* * Assemble a WAL record from the registered data and buffers into an * XLogRecData chain, ready for insertion with XLogInsertRecord(). diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 7735562db01d1..ed3aacabc986e 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -546,7 +546,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all readahead until %X/%X is replayed due to possible TLI change", + "suppressing all readahead until %X/%08X is replayed due to possible TLI change", LSN_FORMAT_ARGS(record->lsn)); #endif @@ -579,7 +579,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy", + "suppressing prefetch in database %u until %X/%08X is replayed due to raw file copy", rlocator.dbOid, LSN_FORMAT_ARGS(record->lsn)); #endif @@ -607,7 +607,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + "suppressing prefetch in relation %u/%u/%u until %X/%08X is replayed, which creates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -630,7 +630,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, which truncates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -729,7 +729,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + "suppressing all prefetch in relation %u/%u/%u until %X/%08X is replayed, because the relation does not exist on disk", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -750,7 +750,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, because the relation is too small", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -928,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (blocks >= %u filtered)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed), filter->filter_from_block); @@ -944,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (whole database)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed)); #endif diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 2790ade1f91e8..dcc8d4f9c1b0b 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -617,7 +617,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + report_invalid_record(state, "invalid record offset at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), pageHeaderSize, targetRecOff); goto err; @@ -626,7 +626,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + report_invalid_record(state, "contrecord is requested by %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -667,7 +667,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (total_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, total_len); goto err; @@ -723,11 +723,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as being a + * partial one. + */ + readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD); if (readOff == XLREAD_WOULDBLOCK) return XLREAD_WOULDBLOCK; else if (readOff < 0) @@ -756,7 +757,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { report_invalid_record(state, - "there is no contrecord flag at %X/%X", + "there is no contrecord flag at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -769,13 +770,22 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) total_len != (pageHeader->xlp_rem_len + gotlen)) { report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", + "invalid contrecord length %u (expected %lld) at %X/%08X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); goto err; } + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + /* Append the continuation from this page to the buffer */ pageHeaderSize = XLogPageHeaderSize(pageHeader); @@ -1132,7 +1142,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, record->xl_tot_len); return false; @@ -1140,7 +1150,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!RmgrIdIsValid(record->xl_rmid)) { report_invalid_record(state, - "invalid resource manager ID %u at %X/%X", + "invalid resource manager ID %u at %X/%08X", record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); return false; } @@ -1153,7 +1163,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!(record->xl_prev < RecPtr)) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1169,7 +1179,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_prev != PrevRecPtr) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1207,7 +1217,7 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) if (!EQ_CRC32C(record->xl_crc, crc)) { report_invalid_record(state, - "incorrect resource manager data checksum in record at %X/%X", + "incorrect resource manager data checksum in record at %X/%08X", LSN_FORMAT_ARGS(recptr)); return false; } @@ -1241,7 +1251,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid magic number %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid magic number %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_magic, fname, LSN_FORMAT_ARGS(recptr), @@ -1256,7 +1266,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), @@ -1298,7 +1308,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, /* hmm, first page of file doesn't have a long header? */ report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), @@ -1318,7 +1328,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "unexpected pageaddr %X/%X in WAL segment %s, LSN %X/%X, offset %u", + "unexpected pageaddr %X/%08X in WAL segment %s, LSN %X/%08X, offset %u", LSN_FORMAT_ARGS(hdr->xlp_pageaddr), fname, LSN_FORMAT_ARGS(recptr), @@ -1344,7 +1354,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%X, offset %u", + "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_tli, state->latestPageTLI, fname, @@ -1756,7 +1766,7 @@ DecodeXLogRecord(XLogReaderState *state, if (block_id <= decoded->max_block_id) { report_invalid_record(state, - "out-of-order block_id %u at %X/%X", + "out-of-order block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1780,14 +1790,14 @@ DecodeXLogRecord(XLogReaderState *state, if (blk->has_data && blk->data_len == 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + "BKPBLOCK_HAS_DATA set, but no data included at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } if (!blk->has_data && blk->data_len != 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%08X", (unsigned int) blk->data_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1823,7 +1833,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%08X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, (unsigned int) blk->bimg_len, @@ -1839,7 +1849,7 @@ DecodeXLogRecord(XLogReaderState *state, (blk->hole_offset != 0 || blk->hole_length != 0)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%08X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, LSN_FORMAT_ARGS(state->ReadRecPtr)); @@ -1853,7 +1863,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ) { report_invalid_record(state, - "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X", + "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%08X", (unsigned int) blk->bimg_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1868,7 +1878,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len != BLCKSZ) { report_invalid_record(state, - "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X", + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%08X", (unsigned int) blk->data_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1884,7 +1894,7 @@ DecodeXLogRecord(XLogReaderState *state, if (rlocator == NULL) { report_invalid_record(state, - "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + "BKPBLOCK_SAME_REL set but no previous rel at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1896,7 +1906,7 @@ DecodeXLogRecord(XLogReaderState *state, else { report_invalid_record(state, - "invalid block_id %u at %X/%X", + "invalid block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1963,7 +1973,7 @@ DecodeXLogRecord(XLogReaderState *state, shortdata_err: report_invalid_record(state, - "record with invalid length at %X/%X", + "record with invalid length at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); err: *errormsg = state->errormsg_buf; @@ -2073,14 +2083,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) !record->record->blocks[block_id].in_use) { report_invalid_record(record, - "could not restore image at %X/%X with invalid block %d specified", + "could not restore image at %X/%08X with invalid block %d specified", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; } if (!record->record->blocks[block_id].has_image) { - report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d", + report_invalid_record(record, "could not restore image at %X/%08X with invalid state, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2107,7 +2117,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "LZ4", block_id); @@ -2124,7 +2134,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (ZSTD_isError(decomp_result)) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "zstd", block_id); @@ -2133,7 +2143,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) } else { - report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with unknown method, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2141,7 +2151,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (!decomp_success) { - report_invalid_record(record, "could not decompress image at %X/%X, block %d", + report_invalid_record(record, "could not decompress image at %X/%08X, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 6ce979f2d8bc4..e8f3ba00caae7 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -620,10 +620,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * than ControlFile->checkPoint is used. */ ereport(LOG, - (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u", - LSN_FORMAT_ARGS(RedoStartLSN), - LSN_FORMAT_ARGS(CheckPointLoc), - CheckPointTLI))); + errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u", + LSN_FORMAT_ARGS(RedoStartLSN), + LSN_FORMAT_ARGS(CheckPointLoc), + CheckPointTLI)); /* * When a backup_label file is present, we want to roll forward from @@ -636,8 +636,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); InRecovery = true; /* force recovery even if SHUTDOWNED */ /* @@ -652,23 +652,23 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) ereport(FATAL, - (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X", - LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); } } else { ereport(FATAL, - (errmsg("could not locate required checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not locate required checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc)), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); wasShutdown = false; /* keep compiler quiet */ } @@ -773,8 +773,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, */ if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) ereport(LOG, - (errmsg("restarting backup recovery with redo LSN %X/%X", - LSN_FORMAT_ARGS(ControlFile->backupStartPoint)))); + errmsg("restarting backup recovery with redo LSN %X/%08X", + LSN_FORMAT_ARGS(ControlFile->backupStartPoint))); /* Get the last valid checkpoint record. */ CheckPointLoc = ControlFile->checkPoint; @@ -786,8 +786,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (record != NULL) { ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } else { @@ -798,8 +798,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * simplify processing around checkpoints. */ ereport(PANIC, - (errmsg("could not locate a valid checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg("could not locate a valid checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); @@ -824,8 +824,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, recoveryTargetName))); else if (recoveryTarget == RECOVERY_TARGET_LSN) ereport(LOG, - (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryTargetLSN)))); + errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN))); else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); @@ -855,7 +855,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("requested timeline %u is not a child of this server's history", recoveryTargetTLI), /* translator: %s is a backup_label file or a pg_control file */ - errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.", haveBackupLabel ? "backup_label" : "pg_control", LSN_FORMAT_ARGS(CheckPointLoc), CheckPointTLI, @@ -870,15 +870,15 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != ControlFile->minRecoveryPointTLI) ereport(FATAL, - (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", - recoveryTargetTLI, - LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), - ControlFile->minRecoveryPointTLI))); + errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI)); ereport(DEBUG1, - (errmsg_internal("redo record is at %X/%X; shutdown %s", - LSN_FORMAT_ARGS(checkPoint.redo), - wasShutdown ? "true" : "false"))); + errmsg_internal("redo record is at %X/%08X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false")); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", U64FromFullTransactionId(checkPoint.nextXid), @@ -1253,14 +1253,14 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, * is pretty crude, but we are not expecting any variability in the file * format). */ - if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c", &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); RedoStartLSN = ((uint64) hi) << 32 | lo; RedoStartTLI = tli_from_walseg; - if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c", &hi, &lo, &ch) != 3 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -1332,7 +1332,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } - if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("this is an incremental backup, not a data directory"), @@ -1722,8 +1722,8 @@ PerformWalRecovery(void) if (record->xl_rmid != RM_XLOG_ID || (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO) ereport(FATAL, - (errmsg("unexpected record type found at redo point %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("unexpected record type found at redo point %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); } else { @@ -1745,8 +1745,8 @@ PerformWalRecovery(void) RmgrStartup(); ereport(LOG, - (errmsg("redo starts at %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("redo starts at %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); /* Prepare to report progress of the redo phase. */ if (!StandbyMode) @@ -1758,7 +1758,7 @@ PerformWalRecovery(void) do { if (!StandbyMode) - ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); #ifdef WAL_DEBUG @@ -1767,7 +1767,7 @@ PerformWalRecovery(void) StringInfoData buf; initStringInfo(&buf); - appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); xlog_outrec(&buf, xlogreader); @@ -1880,9 +1880,9 @@ PerformWalRecovery(void) RmgrCleanup(); ereport(LOG, - (errmsg("redo done at %X/%X system usage: %s", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), - pg_rusage_show(&ru0)))); + errmsg("redo done at %X/%08X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0))); xtime = GetLatestXTime(); if (xtime) ereport(LOG, @@ -2092,7 +2092,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); if (xlrec.overwritten_lsn != record->overwrittenRecPtr) - elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), LSN_FORMAT_ARGS(record->overwrittenRecPtr)); @@ -2101,9 +2101,9 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) missingContrecPtr = InvalidXLogRecPtr; ereport(LOG, - (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", - LSN_FORMAT_ARGS(xlrec.overwritten_lsn), - timestamptz_to_str(xlrec.overwrite_time)))); + errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time))); /* Verifying the record should only happen once */ record->overwrittenRecPtr = InvalidXLogRecPtr; @@ -2129,7 +2129,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) backupEndPoint = lsn; } else - elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X", LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); } } @@ -2224,9 +2224,9 @@ CheckRecoveryConsistency(void) backupEndRequired = false; ereport(LOG, - (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X", - LSN_FORMAT_ARGS(saveBackupStartPoint), - LSN_FORMAT_ARGS(saveBackupEndPoint)))); + errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X", + LSN_FORMAT_ARGS(saveBackupStartPoint), + LSN_FORMAT_ARGS(saveBackupEndPoint))); } /* @@ -2255,8 +2255,8 @@ CheckRecoveryConsistency(void) reachedConsistency = true; SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); ereport(LOG, - (errmsg("consistent recovery state reached at %X/%X", - LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + errmsg("consistent recovery state reached at %X/%08X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr))); } /* @@ -2293,7 +2293,7 @@ rm_redo_error_callback(void *arg) xlog_block_info(&buf, record); /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", + errcontext("WAL redo at %X/%08X for %s", LSN_FORMAT_ARGS(record->ReadRecPtr), buf.data); @@ -2328,7 +2328,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) static void xlog_outrec(StringInfo buf, XLogReaderState *record) { - appendStringInfo(buf, "prev %X/%X; xid %u", + appendStringInfo(buf, "prev %X/%08X; xid %u", LSN_FORMAT_ARGS(XLogRecGetPrev(record)), XLogRecGetXid(record)); @@ -2416,10 +2416,10 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, lsn < minRecoveryPoint && newTLI > minRecoveryPointTLI) ereport(PANIC, - (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", - newTLI, - LSN_FORMAT_ARGS(minRecoveryPoint), - minRecoveryPointTLI))); + errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI)); /* Looks good */ } @@ -2621,8 +2621,8 @@ recoveryStopsBefore(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2789,8 +2789,8 @@ recoveryStopsAfter(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2910,7 +2910,7 @@ getRecoveryStopReason(void) timestamptz_to_str(recoveryStopTime)); else if (recoveryTarget == RECOVERY_TARGET_LSN) snprintf(reason, sizeof(reason), - "%s LSN %X/%X\n", + "%s LSN %X/%08X\n", recoveryStopAfter ? "after" : "before", LSN_FORMAT_ARGS(recoveryStopLSN)); else if (recoveryTarget == RECOVERY_TARGET_NAME) @@ -3213,11 +3213,11 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, XLogFileName(fname, xlogreader->seg.ws_tli, segno, wal_segment_size); ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u", - xlogreader->latestPageTLI, - fname, - LSN_FORMAT_ARGS(xlogreader->latestPagePtr), - offset))); + errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u", + xlogreader->latestPageTLI, + fname, + LSN_FORMAT_ARGS(xlogreader->latestPagePtr), + offset)); record = NULL; } @@ -3429,14 +3429,14 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, errno = save_errno; ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode_for_file_access(), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff))); } else ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff, r, (Size) XLOG_BLCKSZ))); goto next_record_is_invalid; @@ -3718,7 +3718,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, wait_time = wal_retrieve_retry_interval - TimestampDifferenceMilliseconds(last_fail_time, now); - elog(LOG, "waiting for WAL to become available at %X/%X", + elog(LOG, "waiting for WAL to become available at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); /* Do background tasks that might benefit us later. */ @@ -3864,7 +3864,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u", LSN_FORMAT_ARGS(tliRecPtr), tli, curFileTLI); } @@ -4177,10 +4177,10 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) if (currentTle->end < replayLSN) { ereport(LOG, - (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", - newtarget, - replayTLI, - LSN_FORMAT_ARGS(replayLSN)))); + errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN))); return false; } @@ -4760,7 +4760,7 @@ bool check_primary_slot_name(char **newval, void **extra, GucSource source) { if (*newval && strcmp(*newval, "") != 0 && - !ReplicationSlotValidateName(*newval, WARNING)) + !ReplicationSlotValidateName(*newval, false, WARNING)) return false; return true; @@ -4994,13 +4994,25 @@ check_recovery_target_timeline(char **newval, void **extra, GucSource source) rttg = RECOVERY_TARGET_TIMELINE_LATEST; else { + char *endp; + uint64 timeline; + rttg = RECOVERY_TARGET_TIMELINE_NUMERIC; errno = 0; - strtoul(*newval, NULL, 0); - if (errno == EINVAL || errno == ERANGE) + timeline = strtou64(*newval, &endp, 0); + + if (*endp != '\0' || errno == EINVAL || errno == ERANGE) + { + GUC_check_errdetail("\"%s\" is not a valid number.", + "recovery_target_timeline"); + return false; + } + + if (timeline < 1 || timeline > PG_UINT32_MAX) { - GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number."); + GUC_check_errdetail("\"%s\" must be between %u and %u.", + "recovery_target_timeline", 1, UINT_MAX); return false; } } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d47..27ea52fdfee66 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -795,7 +795,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, list_free_deep(timelineHistory); - elog(DEBUG3, "switched to timeline %u valid until %X/%X", + elog(DEBUG3, "switched to timeline %u valid until %X/%08X", state->currTLI, LSN_FORMAT_ARGS(state->currTLIValidUntil)); } diff --git a/src/backend/backup/backup_manifest.c b/src/backend/backup/backup_manifest.c index 22e2be37c95c3..d05252f383c80 100644 --- a/src/backend/backup/backup_manifest.c +++ b/src/backend/backup/backup_manifest.c @@ -281,7 +281,7 @@ AddWALInfoToBackupManifest(backup_manifest_info *manifest, XLogRecPtr startptr, } AppendToManifest(manifest, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", first_wal_range ? "" : ",\n", entry->tli, LSN_FORMAT_ARGS(tl_beginptr), diff --git a/src/backend/backup/basebackup_copy.c b/src/backend/backup/basebackup_copy.c index a284ce318ff7d..18b0b5a52d3f8 100644 --- a/src/backend/backup/basebackup_copy.c +++ b/src/backend/backup/basebackup_copy.c @@ -361,7 +361,7 @@ SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli) tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); /* Data row */ - values[0] = CStringGetTextDatum(psprintf("%X/%X", LSN_FORMAT_ARGS(ptr))); + values[0] = CStringGetTextDatum(psprintf("%X/%08X", LSN_FORMAT_ARGS(ptr))); values[1] = Int64GetDatum(tli); do_tup_output(tstate, values, nulls); diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index 28491b1e0ab08..a0d48ff0fefa9 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -409,7 +409,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn < tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from initial timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -419,7 +419,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn != tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -430,7 +430,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn > backup_state->startpoint) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + errmsg("manifest requires WAL from final timeline %u ending at %X/%08X, but this backup starts at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(backup_state->startpoint)), @@ -441,7 +441,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn != tlep[i]->end) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%08X, but this server switched timelines at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(tlep[i]->end)))); @@ -522,18 +522,18 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (XLogRecPtrIsInvalid(tli_missing_lsn)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but no summaries for that timeline and LSN range exist", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)))); else ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but the summaries for that timeline and LSN range are incomplete", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)), - errdetail("The first unsummarized LSN in this range is %X/%X.", + errdetail("The first unsummarized LSN in this range is %X/%08X.", LSN_FORMAT_ARGS(tli_missing_lsn)))); } diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0dd..fc8638c1b61b6 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -109,6 +109,8 @@ static const struct typinfo TypInfo[] = { F_REGROLEIN, F_REGROLEOUT}, {"regnamespace", REGNAMESPACEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_REGNAMESPACEIN, F_REGNAMESPACEOUT}, + {"regdatabase", REGDATABASEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, + F_REGDATABASEIN, F_REGDATABASEOUT}, {"text", TEXTOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_TEXTIN, F_TEXTOUT}, {"oid", OIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 18316a3968bcf..7dded634eb810 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -1850,6 +1850,17 @@ find_expr_references_walker(Node *node, errmsg("constant of the type %s cannot be used here", "regrole"))); break; + + /* + * Dependencies for regdatabase should be shared among all + * databases, so explicitly inhibit to have dependencies. + */ + case REGDATABASEOID: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constant of the type %s cannot be used here", + "regdatabase"))); + break; } } return false; diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index aa216683b74fe..c4029a4f3d310 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -800,11 +800,11 @@ index_create(Relation heapRelation, errmsg("user-defined indexes on system catalog tables are not supported"))); /* - * Btree text_pattern_ops uses text_eq as the equality operator, which is - * fine as long as the collation is deterministic; text_eq then reduces to + * Btree text_pattern_ops uses texteq as the equality operator, which is + * fine as long as the collation is deterministic; texteq then reduces to * bitwise equality and so it is semantically compatible with the other * operators and functions in that opclass. But with a nondeterministic - * collation, text_eq could yield results that are incompatible with the + * collation, texteq could yield results that are incompatible with the * actual behavior of the index (which is determined by the opclass's * comparison function). We prevent such problems by refusing creation of * an index with that opclass and a nondeterministic collation. @@ -814,7 +814,7 @@ index_create(Relation heapRelation, * opclasses as incompatible with nondeterminism; but for now, this small * hack suffices. * - * Another solution is to use a special operator, not text_eq, as the + * Another solution is to use a special operator, not texteq, as the * equality opclass member; but that is undesirable because it would * prevent index usage in many queries that work fine today. */ diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 1395032413e3d..63c2992d19f75 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -103,6 +103,7 @@ GetSubscription(Oid subid, bool missing_ok) sub->passwordrequired = subform->subpasswordrequired; sub->runasowner = subform->subrunasowner; sub->failover = subform->subfailover; + sub->retaindeadtuples = subform->subretaindeadtuples; /* Get conninfo */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 08f780a2e6382..f6eca09ee153a 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -666,6 +666,14 @@ GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats; +CREATE VIEW pg_dsm_registry_allocations AS + SELECT * FROM pg_get_dsm_registry_allocations(); + +REVOKE ALL ON pg_dsm_registry_allocations FROM PUBLIC; +GRANT SELECT ON pg_dsm_registry_allocations TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); @@ -895,7 +903,7 @@ CREATE VIEW pg_stat_activity AS S.wait_event, S.state, S.backend_xid, - s.backend_xmin, + S.backend_xmin, S.query_id, S.query, S.backend_type @@ -1378,7 +1386,8 @@ REVOKE ALL ON pg_subscription FROM public; GRANT SELECT (oid, subdbid, subskiplsn, subname, subowner, subenabled, subbinary, substream, subtwophasestate, subdisableonerr, subpasswordrequired, subrunasowner, subfailover, - subslotname, subsynccommit, subpublications, suborigin) + subretaindeadtuples, subslotname, subsynccommit, + subpublications, suborigin) ON pg_subscription TO public; CREATE VIEW pg_stat_subscription_stats AS diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 4fffb76e55735..7111d5d5334f2 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -76,7 +76,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, - VacuumParams *params, List *va_cols, + const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel); static void compute_index_stats(Relation onerel, double totalrows, @@ -107,7 +107,7 @@ static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); */ void analyze_rel(Oid relid, RangeVar *relation, - VacuumParams *params, List *va_cols, bool in_outer_xact, + const VacuumParams params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy) { Relation onerel; @@ -116,7 +116,7 @@ analyze_rel(Oid relid, RangeVar *relation, BlockNumber relpages = 0; /* Select logging level */ - if (params->options & VACOPT_VERBOSE) + if (params.options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; @@ -138,8 +138,8 @@ analyze_rel(Oid relid, RangeVar *relation, * * Make sure to generate only logs for ANALYZE in this case. */ - onerel = vacuum_open_relation(relid, relation, params->options & ~(VACOPT_VACUUM), - params->log_min_duration >= 0, + onerel = vacuum_open_relation(relid, relation, params.options & ~(VACOPT_VACUUM), + params.log_min_duration >= 0, ShareUpdateExclusiveLock); /* leave if relation could not be opened or locked */ @@ -155,7 +155,7 @@ analyze_rel(Oid relid, RangeVar *relation, */ if (!vacuum_is_permitted_for_relation(RelationGetRelid(onerel), onerel->rd_rel, - params->options & ~VACOPT_VACUUM)) + params.options & ~VACOPT_VACUUM)) { relation_close(onerel, ShareUpdateExclusiveLock); return; @@ -227,7 +227,7 @@ analyze_rel(Oid relid, RangeVar *relation, else { /* No need for a WARNING if we already complained during VACUUM */ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) ereport(WARNING, (errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables", RelationGetRelationName(onerel)))); @@ -275,7 +275,7 @@ analyze_rel(Oid relid, RangeVar *relation, * appropriate acquirefunc for each child table. */ static void -do_analyze_rel(Relation onerel, VacuumParams *params, +do_analyze_rel(Relation onerel, const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel) @@ -309,9 +309,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; - verbose = (params->options & VACOPT_VERBOSE) != 0; + verbose = (params.options & VACOPT_VERBOSE) != 0; instrument = (verbose || (AmAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); + params.log_min_duration >= 0)); if (inh) ereport(elevel, (errmsg("analyzing \"%s.%s\" inheritance tree", @@ -706,7 +706,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params, * amvacuumcleanup() when called in ANALYZE-only mode. The only exception * among core index AMs is GIN/ginvacuumcleanup(). */ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) { for (ind = 0; ind < nindexes; ind++) { @@ -736,9 +736,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - if (verbose || params->log_min_duration == 0 || + if (verbose || params.log_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) + params.log_min_duration)) { long delay_in_ms; WalUsage walusage; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 54a08e4102e14..b55221d44cd00 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -917,7 +917,7 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * not to be aggressive about this. */ memset(¶ms, 0, sizeof(VacuumParams)); - vacuum_get_cutoffs(OldHeap, ¶ms, &cutoffs); + vacuum_get_cutoffs(OldHeap, params, &cutoffs); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 74ae42b19a710..fae9c41db6565 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -322,11 +322,13 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, } /* - * Extract a CopyHeaderChoice value from a DefElem. This is like - * defGetBoolean() but also accepts the special value "match". + * Extract the CopyFormatOptions.header_line value from a DefElem. + * + * Parses the HEADER option for COPY, which can be a boolean, a non-negative + * integer (number of lines to skip), or the special value "match". */ -static CopyHeaderChoice -defGetCopyHeaderChoice(DefElem *def, bool is_from) +static int +defGetCopyHeaderOption(DefElem *def, bool is_from) { /* * If no parameter value given, assume "true" is meant. @@ -335,20 +337,27 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) return COPY_HEADER_TRUE; /* - * Allow 0, 1, "true", "false", "on", "off", or "match". + * Allow 0, 1, "true", "false", "on", "off", a non-negative integer, or + * "match". */ switch (nodeTag(def->arg)) { case T_Integer: - switch (intVal(def->arg)) { - case 0: - return COPY_HEADER_FALSE; - case 1: - return COPY_HEADER_TRUE; - default: - /* otherwise, error out below */ - break; + int ival = intVal(def->arg); + + if (ival < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("a negative integer value cannot be " + "specified for %s", def->defname))); + + if (!is_from && ival > 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use multi-line header in COPY TO"))); + + return ival; } break; default: @@ -381,7 +390,8 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) } ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s requires a Boolean value or \"match\"", + errmsg("%s requires a Boolean value, a non-negative integer, " + "or the string \"match\"", def->defname))); return COPY_HEADER_FALSE; /* keep compiler quiet */ } @@ -566,7 +576,7 @@ ProcessCopyOptions(ParseState *pstate, if (header_specified) errorConflictingDefElem(defel, pstate); header_specified = true; - opts_out->header_line = defGetCopyHeaderChoice(defel, is_from); + opts_out->header_line = defGetCopyHeaderOption(defel, is_from); } else if (strcmp(defel->defname, "quote") == 0) { @@ -769,7 +779,7 @@ ProcessCopyOptions(ParseState *pstate, errmsg("COPY delimiter cannot be \"%s\"", opts_out->delim))); /* Check header */ - if (opts_out->binary && opts_out->header_line) + if (opts_out->binary && opts_out->header_line != COPY_HEADER_FALSE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), /*- translator: %s is the name of a COPY option, e.g. ON_ERROR */ diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f5fc346e2013b..b1ae97b833dff 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -771,21 +771,30 @@ static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv) { int fldct; - bool done; + bool done = false; /* only available for text or csv input */ Assert(!cstate->opts.binary); /* on input check that the header line is correct if needed */ - if (cstate->cur_lineno == 0 && cstate->opts.header_line) + if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE) { ListCell *cur; TupleDesc tupDesc; + int lines_to_skip = cstate->opts.header_line; + + /* If set to "match", one header line is skipped */ + if (cstate->opts.header_line == COPY_HEADER_MATCH) + lines_to_skip = 1; tupDesc = RelationGetDescr(cstate->rel); - cstate->cur_lineno++; - done = CopyReadLine(cstate, is_csv); + for (int i = 0; i < lines_to_skip; i++) + { + cstate->cur_lineno++; + if ((done = CopyReadLine(cstate, is_csv))) + break; + } if (cstate->opts.header_line == COPY_HEADER_MATCH) { @@ -1538,7 +1547,7 @@ GetDecimalFromHex(char hex) if (isdigit((unsigned char) hex)) return hex - '0'; else - return tolower((unsigned char) hex) - 'a' + 10; + return pg_ascii_tolower((unsigned char) hex) - 'a' + 10; } /* diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index ea6f18f2c8008..67b94b91cae44 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -199,7 +199,7 @@ CopyToTextLikeStart(CopyToState cstate, TupleDesc tupDesc) cstate->file_encoding); /* if a header has been requested send the line */ - if (cstate->opts.header_line) + if (cstate->opts.header_line == COPY_HEADER_TRUE) { ListCell *cur; bool hdr_delim = false; diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index c95eb94501671..502a45163c8ae 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -570,8 +570,8 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * any CREATE DATABASE commands. */ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | - CHECKPOINT_WAIT | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | + CHECKPOINT_WAIT | CHECKPOINT_FLUSH_UNLOGGED); /* * Iterate through all tablespaces of the template database, and copy each @@ -673,7 +673,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * strategy that avoids these problems. */ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); } @@ -1870,7 +1870,7 @@ dropdb(const char *dbname, bool missing_ok, bool force) * Force a checkpoint to make sure the checkpointer has received the * message sent by ForgetDatabaseSyncRequests. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* Close all smgr fds in all backends. */ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2120,8 +2120,8 @@ movedb(const char *dbname, const char *tblspcname) * On Windows, this also ensures that background procs don't hold any open * files, which would cause rmdir() to fail. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT - | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_UNLOGGED); /* Close all smgr fds in all backends. */ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2252,7 +2252,7 @@ movedb(const char *dbname, const char *tblspcname) * any unlogged operations done in the new DB tablespace before the * next checkpoint. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * Force synchronous commit, thus minimizing the window between diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7e2792ead715b..8345bc0264b23 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3582,6 +3582,7 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) { Plan *plan = ((PlanState *) mstate)->plan; + Memoize *mplan = (Memoize *) plan; ListCell *lc; List *context; StringInfoData keystr; @@ -3602,7 +3603,7 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) plan, ancestors); - foreach(lc, ((Memoize *) plan)->param_exprs) + foreach(lc, mplan->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3618,6 +3619,24 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) pfree(keystr.data); + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Estimates: capacity=%u distinct keys=%.0f lookups=%.0f hit percent=%.2f%%\n", + mplan->est_entries, mplan->est_unique_keys, + mplan->est_calls, mplan->est_hit_ratio * 100.0); + } + else + { + ExplainPropertyUInteger("Estimated Capacity", NULL, mplan->est_entries, es); + ExplainPropertyFloat("Estimated Distinct Lookup Keys", NULL, mplan->est_unique_keys, 0, es); + ExplainPropertyFloat("Estimated Lookups", NULL, mplan->est_calls, 0, es); + ExplainPropertyFloat("Estimated Hit Percent", NULL, mplan->est_hit_ratio * 100.0, 2, es); + } + } + if (!es->analyze) return; diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 8d2d743154462..77f8461f42eee 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -1588,6 +1588,7 @@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) pstmt->utilityStmt = (Node *) cstmt; pstmt->stmt_location = rs->stmt_location; pstmt->stmt_len = rs->stmt_len; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* Execute statement */ ProcessUtility(pstmt, cmd, false, diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 546160f09410e..0f03d9743d203 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -215,6 +215,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, wrapper->utilityStmt = stmt; wrapper->stmt_location = stmt_location; wrapper->stmt_len = stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; /* do this step */ ProcessUtility(wrapper, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 4ff246cd94321..cd6c3684482f9 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/htup_details.h" #include "access/table.h" #include "access/twophase.h" @@ -71,8 +72,9 @@ #define SUBOPT_PASSWORD_REQUIRED 0x00000800 #define SUBOPT_RUN_AS_OWNER 0x00001000 #define SUBOPT_FAILOVER 0x00002000 -#define SUBOPT_LSN 0x00004000 -#define SUBOPT_ORIGIN 0x00008000 +#define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 +#define SUBOPT_LSN 0x00008000 +#define SUBOPT_ORIGIN 0x00010000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -98,6 +100,7 @@ typedef struct SubOpts bool passwordrequired; bool runasowner; bool failover; + bool retaindeadtuples; char *origin; XLogRecPtr lsn; } SubOpts; @@ -105,8 +108,10 @@ typedef struct SubOpts static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); static void check_publications_origin(WalReceiverConn *wrconn, List *publications, bool copydata, - char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname); + bool retain_dead_tuples, char *origin, + Oid *subrel_local_oids, int subrel_count, + char *subname); +static void check_pub_dead_tuple_retention(WalReceiverConn *wrconn); static void check_duplicates_in_publist(List *publist, Datum *datums); static List *merge_publications(List *oldpublist, List *newpublist, bool addpub, const char *subname); static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname, char *err); @@ -162,6 +167,8 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->runasowner = false; if (IsSet(supported_opts, SUBOPT_FAILOVER)) opts->failover = false; + if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + opts->retaindeadtuples = false; if (IsSet(supported_opts, SUBOPT_ORIGIN)) opts->origin = pstrdup(LOGICALREP_ORIGIN_ANY); @@ -210,7 +217,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, if (strcmp(opts->slot_name, "none") == 0) opts->slot_name = NULL; else - ReplicationSlotValidateName(opts->slot_name, ERROR); + ReplicationSlotValidateName(opts->slot_name, false, ERROR); } else if (IsSet(supported_opts, SUBOPT_COPY_DATA) && strcmp(defel->defname, "copy_data") == 0) @@ -307,6 +314,15 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_FAILOVER; opts->failover = defGetBoolean(defel); } + else if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES) && + strcmp(defel->defname, "retain_dead_tuples") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_RETAIN_DEAD_TUPLES; + opts->retaindeadtuples = defGetBoolean(defel); + } else if (IsSet(supported_opts, SUBOPT_ORIGIN) && strcmp(defel->defname, "origin") == 0) { @@ -563,7 +579,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_SYNCHRONOUS_COMMIT | SUBOPT_BINARY | SUBOPT_STREAMING | SUBOPT_TWOPHASE_COMMIT | SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | - SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_ORIGIN); + SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -630,6 +647,10 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, stmt->subname))); } + /* Ensure that we can enable retain_dead_tuples */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !opts.enabled, WARNING); + if (!IsSet(opts.specified_opts, SUBOPT_SLOT_NAME) && opts.slot_name == NULL) opts.slot_name = stmt->subname; @@ -670,6 +691,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, values[Anum_pg_subscription_subpasswordrequired - 1] = BoolGetDatum(opts.passwordrequired); values[Anum_pg_subscription_subrunasowner - 1] = BoolGetDatum(opts.runasowner); values[Anum_pg_subscription_subfailover - 1] = BoolGetDatum(opts.failover); + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); values[Anum_pg_subscription_subconninfo - 1] = CStringGetTextDatum(conninfo); if (opts.slot_name) @@ -722,7 +745,11 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, { check_publications(wrconn, publications); check_publications_origin(wrconn, publications, opts.copy_data, - opts.origin, NULL, 0, stmt->subname); + opts.retaindeadtuples, opts.origin, + NULL, 0, stmt->subname); + + if (opts.retaindeadtuples) + check_pub_dead_tuple_retention(wrconn); /* * Set sync state based on if we were asked to do data copy or @@ -881,8 +908,8 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, sizeof(Oid), oid_cmp); check_publications_origin(wrconn, sub->publications, copy_data, - sub->origin, subrel_local_oids, - subrel_count, sub->name); + sub->retaindeadtuples, sub->origin, + subrel_local_oids, subrel_count, sub->name); /* * Rels that we want to remove from subscription and drop any slots @@ -1040,18 +1067,22 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, } /* - * Common checks for altering failover and two_phase options. + * Common checks for altering failover, two_phase, and retain_dead_tuples + * options. */ static void CheckAlterSubOption(Subscription *sub, const char *option, bool slot_needs_update, bool isTopLevel) { + Assert(strcmp(option, "failover") == 0 || + strcmp(option, "two_phase") == 0 || + strcmp(option, "retain_dead_tuples") == 0); + /* - * The checks in this function are required only for failover and - * two_phase options. + * Altering the retain_dead_tuples option does not update the slot on the + * publisher. */ - Assert(strcmp(option, "failover") == 0 || - strcmp(option, "two_phase") == 0); + Assert(!slot_needs_update || strcmp(option, "retain_dead_tuples") != 0); /* * Do not allow changing the option if the subscription is enabled. This @@ -1063,6 +1094,39 @@ CheckAlterSubOption(Subscription *sub, const char *option, * the publisher by the existing walsender, so we could have allowed that * even when the subscription is enabled. But we kept this restriction for * the sake of consistency and simplicity. + * + * Additionally, do not allow changing the retain_dead_tuples option when + * the subscription is enabled to prevent race conditions arising from the + * new option value being acknowledged asynchronously by the launcher and + * apply workers. + * + * Without the restriction, a race condition may arise when a user + * disables and immediately re-enables the retain_dead_tuples option. In + * this case, the launcher might drop the slot upon noticing the disabled + * action, while the apply worker may keep maintaining + * oldest_nonremovable_xid without noticing the option change. During this + * period, a transaction ID wraparound could falsely make this ID appear + * as if it originates from the future w.r.t the transaction ID stored in + * the slot maintained by launcher. + * + * Similarly, if the user enables retain_dead_tuples concurrently with the + * launcher starting the worker, the apply worker may start calculating + * oldest_nonremovable_xid before the launcher notices the enable action. + * Consequently, the launcher may update slot.xmin to a newer value than + * that maintained by the worker. In subsequent cycles, upon integrating + * the worker's oldest_nonremovable_xid, the launcher might detect a + * retreat in the calculated xmin, necessitating additional handling. + * + * XXX To address the above race conditions, we can define + * oldest_nonremovable_xid as FullTransactionID and adds the check to + * disallow retreating the conflict slot's xmin. For now, we kept the + * implementation simple by disallowing change to the retain_dead_tuples, + * but in the future we can change this after some more analysis. + * + * Note that we could restrict only the enabling of retain_dead_tuples to + * avoid the race conditions described above, but we maintain the + * restriction for both enable and disable operations for the sake of + * consistency. */ if (sub->enabled) ereport(ERROR, @@ -1110,6 +1174,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool update_tuple = false; bool update_failover = false; bool update_two_phase = false; + bool check_pub_rdt = false; + bool retain_dead_tuples; + char *origin; Subscription *sub; Form_pg_subscription form; bits32 supported_opts; @@ -1137,6 +1204,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, sub = GetSubscription(subid, false); + retain_dead_tuples = sub->retaindeadtuples; + origin = sub->origin; + /* * Don't allow non-superuser modification of a subscription with * password_required=false. @@ -1165,7 +1235,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | - SUBOPT_ORIGIN); + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); @@ -1325,11 +1395,62 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, replaces[Anum_pg_subscription_subfailover - 1] = true; } + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + { + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretaindeadtuples - 1] = true; + + CheckAlterSubOption(sub, "retain_dead_tuples", false, isTopLevel); + + /* + * Workers may continue running even after the + * subscription has been disabled. + * + * To prevent race conditions (as described in + * CheckAlterSubOption()), ensure that all worker + * processes have already exited before proceeding. + */ + if (logicalrep_workers_find(subid, true, true)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot alter retain_dead_tuples when logical replication worker is still running"), + errhint("Try again after some time."))); + + /* + * Remind the user that enabling subscription will prevent + * the accumulation of dead tuples. + */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !sub->enabled, NOTICE); + + /* + * Notify the launcher to manage the replication slot for + * conflict detection. This ensures that replication slot + * is efficiently handled (created, updated, or dropped) + * in response to any configuration changes. + */ + ApplyLauncherWakeupAtCommit(); + + check_pub_rdt = opts.retaindeadtuples; + retain_dead_tuples = opts.retaindeadtuples; + } + if (IsSet(opts.specified_opts, SUBOPT_ORIGIN)) { values[Anum_pg_subscription_suborigin - 1] = CStringGetTextDatum(opts.origin); replaces[Anum_pg_subscription_suborigin - 1] = true; + + /* + * Check if changes from different origins may be received + * from the publisher when the origin is changed to ANY + * and retain_dead_tuples is enabled. + */ + check_pub_rdt = retain_dead_tuples && + pg_strcasecmp(opts.origin, LOGICALREP_ORIGIN_ANY) == 0; + + origin = opts.origin; } update_tuple = true; @@ -1347,6 +1468,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot enable subscription that does not have a slot name"))); + /* + * Check track_commit_timestamp only when enabling the + * subscription in case it was disabled after creation. See + * comments atop CheckSubDeadTupleRetention() for details. + */ + if (sub->retaindeadtuples) + CheckSubDeadTupleRetention(opts.enabled, !opts.enabled, + WARNING); + values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(opts.enabled); replaces[Anum_pg_subscription_subenabled - 1] = true; @@ -1355,6 +1485,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, ApplyLauncherWakeupAtCommit(); update_tuple = true; + + /* + * The subscription might be initially created with + * connect=false and retain_dead_tuples=true, meaning the + * remote server's status may not be checked. Ensure this + * check is conducted now. + */ + check_pub_rdt = sub->retaindeadtuples && opts.enabled; break; } @@ -1369,6 +1507,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, CStringGetTextDatum(stmt->conninfo); replaces[Anum_pg_subscription_subconninfo - 1] = true; update_tuple = true; + + /* + * Since the remote server configuration might have changed, + * perform a check to ensure it permits enabling + * retain_dead_tuples. + */ + check_pub_rdt = sub->retaindeadtuples; break; case ALTER_SUBSCRIPTION_SET_PUBLICATION: @@ -1539,7 +1684,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, if (!XLogRecPtrIsInvalid(remote_lsn) && opts.lsn < remote_lsn) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("skip WAL location (LSN %X/%X) must be greater than origin LSN %X/%X", + errmsg("skip WAL location (LSN %X/%08X) must be greater than origin LSN %X/%08X", LSN_FORMAT_ARGS(opts.lsn), LSN_FORMAT_ARGS(remote_lsn)))); } @@ -1568,14 +1713,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, } /* - * Try to acquire the connection necessary for altering the slot, if - * needed. + * Try to acquire the connection necessary either for modifying the slot + * or for checking if the remote server permits enabling + * retain_dead_tuples. * * This has to be at the end because otherwise if there is an error while * doing the database operations we won't be able to rollback altered * slot. */ - if (update_failover || update_two_phase) + if (update_failover || update_two_phase || check_pub_rdt) { bool must_use_password; char *err; @@ -1584,10 +1730,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, /* Load the library providing us libpq calls. */ load_file("libpqwalreceiver", false); - /* Try to connect to the publisher. */ + /* + * Try to connect to the publisher, using the new connection string if + * available. + */ must_use_password = sub->passwordrequired && !sub->ownersuperuser; - wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, - sub->name, &err); + wrconn = walrcv_connect(stmt->conninfo ? stmt->conninfo : sub->conninfo, + true, true, must_use_password, sub->name, + &err); if (!wrconn) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -1596,9 +1746,17 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, PG_TRY(); { - walrcv_alter_slot(wrconn, sub->slotname, - update_failover ? &opts.failover : NULL, - update_two_phase ? &opts.twophase : NULL); + if (retain_dead_tuples) + check_pub_dead_tuple_retention(wrconn); + + check_publications_origin(wrconn, sub->publications, false, + retain_dead_tuples, origin, NULL, 0, + sub->name); + + if (update_failover || update_two_phase) + walrcv_alter_slot(wrconn, sub->slotname, + update_failover ? &opts.failover : NULL, + update_two_phase ? &opts.twophase : NULL); } PG_FINALLY(); { @@ -2086,20 +2244,29 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) * Check and log a warning if the publisher has subscribed to the same table, * its partition ancestors (if it's a partition), or its partition children (if * it's a partitioned table), from some other publishers. This check is - * required only if "copy_data = true" and "origin = none" for CREATE - * SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements to notify the - * user that data having origin might have been copied. + * required in the following scenarios: * - * This check need not be performed on the tables that are already added - * because incremental sync for those tables will happen through WAL and the - * origin of the data can be identified from the WAL records. + * 1) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "copy_data = true" and "origin = none": + * - Warn the user that data with an origin might have been copied. + * - This check is skipped for tables already added, as incremental sync via + * WAL allows origin tracking. The list of such tables is in + * subrel_local_oids. * - * subrel_local_oids contains the list of relation oids that are already - * present on the subscriber. + * 2) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "retain_dead_tuples = true" and "origin = any", and for ALTER + * SUBSCRIPTION statements that modify retain_dead_tuples or origin, or + * when the publisher's status changes (e.g., due to a connection string + * update): + * - Warn the user that only conflict detection info for local changes on + * the publisher is retained. Data from other origins may lack sufficient + * details for reliable conflict detection. + * - See comments atop worker.c for more details. */ static void check_publications_origin(WalReceiverConn *wrconn, List *publications, - bool copydata, char *origin, Oid *subrel_local_oids, + bool copydata, bool retain_dead_tuples, + char *origin, Oid *subrel_local_oids, int subrel_count, char *subname) { WalRcvExecResult *res; @@ -2108,9 +2275,29 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, Oid tableRow[1] = {TEXTOID}; List *publist = NIL; int i; + bool check_rdt; + bool check_table_sync; + bool origin_none = origin && + pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0; + + /* + * Enable retain_dead_tuples checks only when origin is set to 'any', + * since with origin='none' only local changes are replicated to the + * subscriber. + */ + check_rdt = retain_dead_tuples && !origin_none; + + /* + * Enable table synchronization checks only when origin is 'none', to + * ensure that data from other origins is not inadvertently copied. + */ + check_table_sync = copydata && origin_none; - if (!copydata || !origin || - (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0)) + /* retain_dead_tuples and table sync checks occur separately */ + Assert(!(check_rdt && check_table_sync)); + + /* Return if no checks are required */ + if (!check_rdt && !check_table_sync) return; initStringInfo(&cmd); @@ -2129,16 +2316,23 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, /* * In case of ALTER SUBSCRIPTION ... REFRESH, subrel_local_oids contains * the list of relation oids that are already present on the subscriber. - * This check should be skipped for these tables. + * This check should be skipped for these tables if checking for table + * sync scenario. However, when handling the retain_dead_tuples scenario, + * ensure all tables are checked, as some existing tables may now include + * changes from other origins due to newly created subscriptions on the + * publisher. */ - for (i = 0; i < subrel_count; i++) + if (check_table_sync) { - Oid relid = subrel_local_oids[i]; - char *schemaname = get_namespace_name(get_rel_namespace(relid)); - char *tablename = get_rel_name(relid); + for (i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *tablename = get_rel_name(relid); - appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", - schemaname, tablename); + appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, tablename); + } } res = walrcv_exec(wrconn, cmd.data, 1, tableRow); @@ -2173,22 +2367,37 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * XXX: For simplicity, we don't check whether the table has any data or * not. If the table doesn't have any data then we don't need to * distinguish between data having origin and data not having origin so we - * can avoid logging a warning in that case. + * can avoid logging a warning for table sync scenario. */ if (publist) { StringInfo pubnames = makeStringInfo(); + StringInfo err_msg = makeStringInfo(); + StringInfo err_hint = makeStringInfo(); /* Prepare the list of publication(s) for warning message. */ GetPublicationsStr(publist, pubnames, false); + + if (check_table_sync) + { + appendStringInfo(err_msg, _("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin"), + subname); + appendStringInfoString(err_hint, _("Verify that initial data copied from the publisher tables did not come from other origins.")); + } + else + { + appendStringInfo(err_msg, _("subscription \"%s\" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins"), + subname); + appendStringInfoString(err_hint, _("Consider using origin = NONE or disabling retain_dead_tuples.")); + } + ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", - subname), - errdetail_plural("The subscription being created subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", - "The subscription being created subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + errmsg_internal("%s", err_msg->data), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", list_length(publist), pubnames->data), - errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + errhint_internal("%s", err_hint->data)); } ExecDropSingleTupleTableSlot(slot); @@ -2196,6 +2405,101 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, walrcv_clear_result(res); } +/* + * Determine whether the retain_dead_tuples can be enabled based on the + * publisher's status. + * + * This option is disallowed if the publisher is running a version earlier + * than the PG19, or if the publisher is in recovery (i.e., it is a standby + * server). + * + * See comments atop worker.c for a detailed explanation. + */ +static void +check_pub_dead_tuple_retention(WalReceiverConn *wrconn) +{ + WalRcvExecResult *res; + Oid RecoveryRow[1] = {BOOLOID}; + TupleTableSlot *slot; + bool isnull; + bool remote_in_recovery; + + if (walrcv_server_version(wrconn) < 19000) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot enable retain_dead_tuples if the publisher is running a version earlier than PostgreSQL 19")); + + res = walrcv_exec(wrconn, "SELECT pg_is_in_recovery()", 1, RecoveryRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not obtain recovery progress from the publisher: %s", + res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + elog(ERROR, "failed to fetch tuple for the recovery progress"); + + remote_in_recovery = DatumGetBool(slot_getattr(slot, 1, &isnull)); + + if (remote_in_recovery) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery.")); + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * Check if the subscriber's configuration is adequate to enable the + * retain_dead_tuples option. + * + * Issue an ERROR if the wal_level does not support the use of replication + * slots when check_guc is set to true. + * + * Issue a WARNING if track_commit_timestamp is not enabled when check_guc is + * set to true. This is only to highlight the importance of enabling + * track_commit_timestamp instead of catching all the misconfigurations, as + * this setting can be adjusted after subscription creation. Without it, the + * apply worker will simply skip conflict detection. + * + * Issue a WARNING or NOTICE if the subscription is disabled. Do not raise an + * ERROR since users can only modify retain_dead_tuples for disabled + * subscriptions. And as long as the subscription is enabled promptly, it will + * not pose issues. + */ +void +CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled) +{ + Assert(elevel_for_sub_disabled == NOTICE || + elevel_for_sub_disabled == WARNING); + + if (check_guc && wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("\"wal_level\" is insufficient to create the replication slot required by retain_dead_tuples"), + errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")); + + if (check_guc && !track_commit_timestamp) + ereport(WARNING, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("commit timestamp and origin data required for detecting conflicts won't be retained"), + errhint("Consider setting \"%s\" to true.", + "track_commit_timestamp")); + + if (sub_disabled) + ereport(elevel_for_sub_disabled, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("deleted rows to detect conflicts would not be removed until the subscription is enabled"), + (elevel_for_sub_disabled > NOTICE) + ? errhint("Consider setting %s to false.", + "retain_dead_tuples") : 0); +} + /* * Get the list of tables which belong to specified publications on the * publisher connection. diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b8837f26cb4fd..cb811520c2959 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2711,8 +2711,7 @@ MergeAttributes(List *columns, const List *supers, char relpersistence, RelationGetRelationName(relation)))); /* If existing rel is temp, it must belong to this session */ - if (relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !relation->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(relation)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg(!is_partition @@ -15488,6 +15487,14 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = IndexGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the index's table if it's not + * the same table. + */ + if (relid != tab->relid) + LockRelationOid(relid, AccessExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15504,6 +15511,20 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = StatisticsGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the statistics object's table + * if it's not the same table. However, we take + * ShareUpdateExclusiveLock here, aligning with the lock level used in + * CreateStatistics and RemoveStatisticsById. + * + * CAUTION: this should be done after all cases that grab + * AccessExclusiveLock, else we risk causing deadlock due to needing + * to promote our table lock. + */ + if (relid != tab->relid) + LockRelationOid(relid, ShareUpdateExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15727,7 +15748,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, { AlterDomainStmt *stmt = (AlterDomainStmt *) stm; - if (stmt->subtype == 'C') /* ADD CONSTRAINT */ + if (stmt->subtype == AD_AddConstraint) { Constraint *con = castNode(Constraint, stmt->def); AlterTableCmd *cmd = makeNode(AlterTableCmd); @@ -17230,15 +17251,13 @@ ATExecAddInherit(Relation child_rel, RangeVar *parent, LOCKMODE lockmode) RelationGetRelationName(parent_rel)))); /* If parent rel is temp, it must belong to this session */ - if (parent_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !parent_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(parent_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from temporary relation of another session"))); /* Ditto for the child */ - if (child_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !child_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(child_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit to temporary relation of another session"))); @@ -20309,15 +20328,13 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd, RelationGetRelationName(rel)))); /* If the parent is temp, it must belong to this session */ - if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach as partition of temporary relation of another session"))); /* Ditto for the partition */ - if (attachrel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !attachrel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(attachrel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach temporary relation of another session as partition"))); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index a9005cc7212b6..df31eace47ac9 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -500,7 +500,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) * mustn't delete. So instead, we force a checkpoint which will clean * out any lingering files, and try again. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * On Windows, an unlinked file persists in the directory listing diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 67f8e70f9c166..7dc121f73f17e 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -80,6 +80,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -2693,7 +2694,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2710,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2800,6 +2810,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2955,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2977,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3142,6 +3161,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3318,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3378,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). + * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 45ae7472ab5ad..26d985193aea4 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -939,11 +939,19 @@ DefineDomain(ParseState *pstate, CreateDomainStmt *stmt) break; case CONSTR_NOTNULL: - if (nullDefined && !typNotNull) + if (nullDefined) + { + if (!typNotNull) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting NULL/NOT NULL constraints"), + parser_errposition(pstate, constr->location)); + ereport(ERROR, - errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting NULL/NOT NULL constraints"), + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("redundant NOT NULL constraint definition"), parser_errposition(pstate, constr->location)); + } if (constr->is_no_inherit) ereport(ERROR, errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 02993d320dafc..733ef40ae7c52 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -124,7 +124,7 @@ static void vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); -static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy); static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); @@ -465,7 +465,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) } /* Now go through the common routine */ - vacuum(vacstmt->rels, ¶ms, bstrategy, vac_context, isTopLevel); + vacuum(vacstmt->rels, params, bstrategy, vac_context, isTopLevel); /* Finally, clean up the vacuum memory context */ MemoryContextDelete(vac_context); @@ -494,7 +494,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) * memory context that will not disappear at transaction commit. */ void -vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, +vacuum(List *relations, const VacuumParams params, BufferAccessStrategy bstrategy, MemoryContext vac_context, bool isTopLevel) { static bool in_vacuum = false; @@ -503,9 +503,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, volatile bool in_outer_xact, use_own_xacts; - Assert(params != NULL); - - stmttype = (params->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; + stmttype = (params.options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; /* * We cannot run VACUUM inside a user transaction block; if we were inside @@ -515,7 +513,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * * ANALYZE (without VACUUM) can run either way. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { PreventInTransactionBlock(isTopLevel, stmttype); in_outer_xact = false; @@ -538,7 +536,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * Build list of relation(s) to process, putting any new data in * vac_context for safekeeping. */ - if (params->options & VACOPT_ONLY_DATABASE_STATS) + if (params.options & VACOPT_ONLY_DATABASE_STATS) { /* We don't process any tables in this case */ Assert(relations == NIL); @@ -554,7 +552,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, List *sublist; MemoryContext old_context; - sublist = expand_vacuum_rel(vrel, vac_context, params->options); + sublist = expand_vacuum_rel(vrel, vac_context, params.options); old_context = MemoryContextSwitchTo(vac_context); newrels = list_concat(newrels, sublist); MemoryContextSwitchTo(old_context); @@ -562,7 +560,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, relations = newrels; } else - relations = get_all_vacuum_rels(vac_context, params->options); + relations = get_all_vacuum_rels(vac_context, params.options); /* * Decide whether we need to start/commit our own transactions. @@ -578,11 +576,11 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * transaction block, and also in an autovacuum worker, use own * transactions so we can release locks sooner. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) use_own_xacts = true; else { - Assert(params->options & VACOPT_ANALYZE); + Assert(params.options & VACOPT_ANALYZE); if (AmAutoVacuumWorkerProcess()) use_own_xacts = true; else if (in_outer_xact) @@ -633,21 +631,13 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, { VacuumRelation *vrel = lfirst_node(VacuumRelation, cur); - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { - VacuumParams params_copy; - - /* - * vacuum_rel() scribbles on the parameters, so give it a copy - * to avoid affecting other relations. - */ - memcpy(¶ms_copy, params, sizeof(VacuumParams)); - - if (!vacuum_rel(vrel->oid, vrel->relation, ¶ms_copy, bstrategy)) + if (!vacuum_rel(vrel->oid, vrel->relation, params, bstrategy)) continue; } - if (params->options & VACOPT_ANALYZE) + if (params.options & VACOPT_ANALYZE) { /* * If using separate xacts, start one for analyze. Otherwise, @@ -711,8 +701,8 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, StartTransactionCommand(); } - if ((params->options & VACOPT_VACUUM) && - !(params->options & VACOPT_SKIP_DATABASE_STATS)) + if ((params.options & VACOPT_VACUUM) && + !(params.options & VACOPT_SKIP_DATABASE_STATS)) { /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. @@ -1110,7 +1100,7 @@ get_all_vacuum_rels(MemoryContext vac_context, int options) * minimum). */ bool -vacuum_get_cutoffs(Relation rel, const VacuumParams *params, +vacuum_get_cutoffs(Relation rel, const VacuumParams params, struct VacuumCutoffs *cutoffs) { int freeze_min_age, @@ -1126,10 +1116,10 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, aggressiveMXIDCutoff; /* Use mutable copies of freeze age parameters */ - freeze_min_age = params->freeze_min_age; - multixact_freeze_min_age = params->multixact_freeze_min_age; - freeze_table_age = params->freeze_table_age; - multixact_freeze_table_age = params->multixact_freeze_table_age; + freeze_min_age = params.freeze_min_age; + multixact_freeze_min_age = params.multixact_freeze_min_age; + freeze_table_age = params.freeze_table_age; + multixact_freeze_table_age = params.multixact_freeze_table_age; /* Set pg_class fields in cutoffs */ cutoffs->relfrozenxid = rel->rd_rel->relfrozenxid; @@ -2006,7 +1996,7 @@ vac_truncate_clog(TransactionId frozenXID, * At entry and exit, we are not inside a transaction. */ static bool -vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy) { LOCKMODE lmode; @@ -2019,18 +2009,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, int save_nestlevel; VacuumParams toast_vacuum_params; - Assert(params != NULL); - /* * This function scribbles on the parameters, so make a copy early to * avoid affecting the TOAST table (if we do end up recursing to it). */ - memcpy(&toast_vacuum_params, params, sizeof(VacuumParams)); + memcpy(&toast_vacuum_params, ¶ms, sizeof(VacuumParams)); /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); - if (!(params->options & VACOPT_FULL)) + if (!(params.options & VACOPT_FULL)) { /* * In lazy vacuum, we can set the PROC_IN_VACUUM flag, which lets @@ -2056,7 +2044,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyProc->statusFlags |= PROC_IN_VACUUM; - if (params->is_wraparound) + if (params.is_wraparound) MyProc->statusFlags |= PROC_VACUUM_FOR_WRAPAROUND; ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; LWLockRelease(ProcArrayLock); @@ -2080,12 +2068,12 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either * way, we can be sure that no other backend is vacuuming the same table. */ - lmode = (params->options & VACOPT_FULL) ? + lmode = (params.options & VACOPT_FULL) ? AccessExclusiveLock : ShareUpdateExclusiveLock; /* open the relation and get the appropriate lock on it */ - rel = vacuum_open_relation(relid, relation, params->options, - params->log_min_duration >= 0, lmode); + rel = vacuum_open_relation(relid, relation, params.options, + params.log_min_duration >= 0, lmode); /* leave if relation could not be opened or locked */ if (!rel) @@ -2100,8 +2088,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * This is only safe to do because we hold a session lock on the main * relation that prevents concurrent deletion. */ - if (OidIsValid(params->toast_parent)) - priv_relid = params->toast_parent; + if (OidIsValid(params.toast_parent)) + priv_relid = params.toast_parent; else priv_relid = RelationGetRelid(rel); @@ -2114,7 +2102,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (!vacuum_is_permitted_for_relation(priv_relid, rel->rd_rel, - params->options & ~VACOPT_ANALYZE)) + params.options & ~VACOPT_ANALYZE)) { relation_close(rel, lmode); PopActiveSnapshot(); @@ -2185,7 +2173,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * Set index_cleanup option based on index_cleanup reloption if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->index_cleanup == VACOPTVALUE_UNSPECIFIED) + if (params.index_cleanup == VACOPTVALUE_UNSPECIFIED) { StdRdOptIndexCleanup vacuum_index_cleanup; @@ -2196,23 +2184,23 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, ((StdRdOptions *) rel->rd_options)->vacuum_index_cleanup; if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO) - params->index_cleanup = VACOPTVALUE_AUTO; + params.index_cleanup = VACOPTVALUE_AUTO; else if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON) - params->index_cleanup = VACOPTVALUE_ENABLED; + params.index_cleanup = VACOPTVALUE_ENABLED; else { Assert(vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF); - params->index_cleanup = VACOPTVALUE_DISABLED; + params.index_cleanup = VACOPTVALUE_DISABLED; } } #ifdef USE_INJECTION_POINTS - if (params->index_cleanup == VACOPTVALUE_AUTO) + if (params.index_cleanup == VACOPTVALUE_AUTO) INJECTION_POINT("vacuum-index-cleanup-auto", NULL); - else if (params->index_cleanup == VACOPTVALUE_DISABLED) + else if (params.index_cleanup == VACOPTVALUE_DISABLED) INJECTION_POINT("vacuum-index-cleanup-disabled", NULL); - else if (params->index_cleanup == VACOPTVALUE_ENABLED) + else if (params.index_cleanup == VACOPTVALUE_ENABLED) INJECTION_POINT("vacuum-index-cleanup-enabled", NULL); #endif @@ -2222,36 +2210,36 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (rel->rd_options != NULL && ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate >= 0) - params->max_eager_freeze_failure_rate = + params.max_eager_freeze_failure_rate = ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate; /* * Set truncate option based on truncate reloption or GUC if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->truncate == VACOPTVALUE_UNSPECIFIED) + if (params.truncate == VACOPTVALUE_UNSPECIFIED) { StdRdOptions *opts = (StdRdOptions *) rel->rd_options; if (opts && opts->vacuum_truncate_set) { if (opts->vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } else if (vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } #ifdef USE_INJECTION_POINTS - if (params->truncate == VACOPTVALUE_AUTO) + if (params.truncate == VACOPTVALUE_AUTO) INJECTION_POINT("vacuum-truncate-auto", NULL); - else if (params->truncate == VACOPTVALUE_DISABLED) + else if (params.truncate == VACOPTVALUE_DISABLED) INJECTION_POINT("vacuum-truncate-disabled", NULL); - else if (params->truncate == VACOPTVALUE_ENABLED) + else if (params.truncate == VACOPTVALUE_ENABLED) INJECTION_POINT("vacuum-truncate-enabled", NULL); #endif @@ -2261,9 +2249,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * automatically rebuilt by cluster_rel so we shouldn't recurse to it, * unless PROCESS_MAIN is disabled. */ - if ((params->options & VACOPT_PROCESS_TOAST) != 0 && - ((params->options & VACOPT_FULL) == 0 || - (params->options & VACOPT_PROCESS_MAIN) == 0)) + if ((params.options & VACOPT_PROCESS_TOAST) != 0 && + ((params.options & VACOPT_FULL) == 0 || + (params.options & VACOPT_PROCESS_MAIN) == 0)) toast_relid = rel->rd_rel->reltoastrelid; else toast_relid = InvalidOid; @@ -2286,16 +2274,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * table is required (e.g., PROCESS_TOAST is set), we force PROCESS_MAIN * to be set when we recurse to the TOAST table. */ - if (params->options & VACOPT_PROCESS_MAIN) + if (params.options & VACOPT_PROCESS_MAIN) { /* * Do the actual work --- either FULL or "lazy" vacuum */ - if (params->options & VACOPT_FULL) + if (params.options & VACOPT_FULL) { ClusterParams cluster_params = {0}; - if ((params->options & VACOPT_VERBOSE) != 0) + if ((params.options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ @@ -2342,7 +2330,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, toast_vacuum_params.options |= VACOPT_PROCESS_MAIN; toast_vacuum_params.toast_parent = relid; - vacuum_rel(toast_relid, NULL, &toast_vacuum_params, bstrategy); + vacuum_rel(toast_relid, NULL, toast_vacuum_params, bstrategy); } /* diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4ec..1a37737d4a235 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. */ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062e..ca33a854278ed 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -279,7 +279,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * executor is performing an UPDATE that could not use an * optimization like heapam's HOT (in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_All). Receiving this hint makes + * 'update_indexes' to TU_All). Receiving this hint makes * us consider if we should pass down the 'indexUnchanged' * hint in turn. That's something that we figure out for * each index_insert() call iff 'update' is true. @@ -290,7 +290,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * HOT has been applied and any updated columns are indexed * only by summarizing indexes (or in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_Summarizing). We can (and must) + * 'update_indexes' to TU_Summarizing). We can (and must) * therefore only update the indexes that have * 'amsummarizing' = true. * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f3e77bda27906..f098a5557cf07 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -189,6 +189,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42db9..f262e7a66f771 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -670,7 +670,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tid, NULL, slot, NULL, NULL, false)) skip_tuple = true; /* "do nothing" */ } @@ -746,7 +746,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tid, NULL, NULL, NULL, NULL, false); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 54da8e7995bd3..7c6c2c1f6e42a 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1474,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2117,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile index e8c12060b93df..68677ba42e189 100644 --- a/src/backend/jit/llvm/Makefile +++ b/src/backend/jit/llvm/Makefile @@ -31,7 +31,7 @@ endif # All files in this directory use LLVM. CFLAGS += $(LLVM_CFLAGS) CXXFLAGS += $(LLVM_CXXFLAGS) -override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(LLVM_CPPFLAGS) SHLIB_LINK += $(LLVM_LIBS) # Because this module includes C++ files, we need to use a C++ diff --git a/src/backend/jit/llvm/meson.build b/src/backend/jit/llvm/meson.build index c8e06dfbe351b..805fbd6900679 100644 --- a/src/backend/jit/llvm/meson.build +++ b/src/backend/jit/llvm/meson.build @@ -53,7 +53,7 @@ llvm_irgen_args = [ if ccache.found() llvm_irgen_command = ccache - llvm_irgen_args = [clang.path()] + llvm_irgen_args + llvm_irgen_args = [clang.full_path()] + llvm_irgen_args else llvm_irgen_command = clang endif diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd453..4da46666439db 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -1917,7 +1925,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 332fad278351c..fecee8224d075 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -2873,8 +2873,11 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, !token_has_regexp(identLine->pg_user) && (ofs = strstr(identLine->pg_user->string, "\\1")) != NULL) { + const char *repl_str; + size_t repl_len; + char *old_pg_user; char *expanded_pg_user; - int offset; + size_t offset; /* substitution of the first argument requested */ if (matches[1].rm_so < 0) @@ -2886,18 +2889,33 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, *error_p = true; return; } + repl_str = system_user + matches[1].rm_so; + repl_len = matches[1].rm_eo - matches[1].rm_so; /* - * length: original length minus length of \1 plus length of match - * plus null terminator + * It's allowed to have more than one \1 in the string, and we'll + * replace them all. But that's pretty unusual so we optimize on + * the assumption of only one occurrence, which motivates doing + * repeated replacements instead of making two passes over the + * string to determine the final length right away. */ - expanded_pg_user = palloc0(strlen(identLine->pg_user->string) - 2 + (matches[1].rm_eo - matches[1].rm_so) + 1); - offset = ofs - identLine->pg_user->string; - memcpy(expanded_pg_user, identLine->pg_user->string, offset); - memcpy(expanded_pg_user + offset, - system_user + matches[1].rm_so, - matches[1].rm_eo - matches[1].rm_so); - strcat(expanded_pg_user, ofs + 2); + old_pg_user = identLine->pg_user->string; + do + { + /* + * length: current length minus length of \1 plus length of + * replacement plus null terminator + */ + expanded_pg_user = palloc(strlen(old_pg_user) - 2 + repl_len + 1); + /* ofs points into the old_pg_user string at this point */ + offset = ofs - old_pg_user; + memcpy(expanded_pg_user, old_pg_user, offset); + memcpy(expanded_pg_user + offset, repl_str, repl_len); + strcpy(expanded_pg_user + offset + repl_len, ofs + 2); + if (old_pg_user != identLine->pg_user->string) + pfree(old_pg_user); + old_pg_user = expanded_pg_user; + } while ((ofs = strstr(old_pg_user + offset + repl_len, "\\1")) != NULL); /* * Mark the token as quoted, so it will only be compared literally diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf2c..8ee6c0ba31576 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. +# client. DATABASE-USERNAME is the requested PostgreSQL user name. +# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. # @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b44..bdcb5e4f26159 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -125,13 +125,17 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres")); /* - * In the postmaster, absorb the environment values for LC_COLLATE and - * LC_CTYPE. Individual backends will change these later to settings - * taken from pg_database, but the postmaster cannot do that. If we leave - * these set to "C" then message localization might not work well in the - * postmaster. + * Collation is handled by pg_locale.c, and the behavior is dependent on + * the provider. strcoll(), etc., should not be called directly. + */ + init_locale("LC_COLLATE", LC_COLLATE, "C"); + + /* + * In the postmaster, absorb the environment value for LC_CTYPE. + * Individual backends will change it later to pg_database.datctype, but + * the postmaster cannot do that. If we leave it set to "C" then message + * localization might not work well in the postmaster. */ - init_locale("LC_COLLATE", LC_COLLATE, ""); init_locale("LC_CTYPE", LC_CTYPE, ""); /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 3d44815ed5adf..344a3188317b1 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2247,7 +2247,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers) * Determines and returns the cost of an Append node. */ void -cost_append(AppendPath *apath) +cost_append(AppendPath *apath, PlannerInfo *root) { ListCell *l; @@ -2309,26 +2309,52 @@ cost_append(AppendPath *apath) foreach(l, apath->subpaths) { Path *subpath = (Path *) lfirst(l); - Path sort_path; /* dummy for result of cost_sort */ + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { /* * We'll need to insert a Sort node, so include costs for - * that. We can use the parent's LIMIT if any, since we + * that. We choose to use incremental sort if it is + * enabled and there are presorted keys; otherwise we use + * full sort. + * + * We can use the parent's LIMIT if any, since we * certainly won't pull more than that many tuples from * any child. */ - cost_sort(&sort_path, - NULL, /* doesn't currently need root */ - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - apath->limit_tuples); + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + subpath = &sort_path; } @@ -2546,13 +2572,13 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, Cost input_startup_cost = mpath->subpath->startup_cost; Cost input_total_cost = mpath->subpath->total_cost; double tuples = mpath->subpath->rows; - double calls = mpath->calls; + Cardinality est_calls = mpath->est_calls; int width = mpath->subpath->pathtarget->width; double hash_mem_bytes; double est_entry_bytes; - double est_cache_entries; - double ndistinct; + Cardinality est_cache_entries; + Cardinality ndistinct; double evict_ratio; double hit_ratio; Cost startup_cost; @@ -2578,7 +2604,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, est_cache_entries = floor(hash_mem_bytes / est_entry_bytes); /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL, + ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL, &estinfo); /* @@ -2590,7 +2616,10 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * certainly mean a MemoizePath will never survive add_path(). */ if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; + ndistinct = est_calls; + + /* Remember the ndistinct estimate for EXPLAIN */ + mpath->est_unique_keys = ndistinct; /* * Since we've already estimated the maximum number of entries we can @@ -2618,9 +2647,12 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * must look at how many scans are estimated in total for this node and * how many of those scans we expect to get a cache hit. */ - hit_ratio = ((calls - ndistinct) / calls) * + hit_ratio = ((est_calls - ndistinct) / est_calls) * (est_cache_entries / Max(ndistinct, est_cache_entries)); + /* Remember the hit ratio estimate for EXPLAIN */ + mpath->est_hit_ratio = hit_ratio; + Assert(hit_ratio >= 0 && hit_ratio <= 1.0); /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 7aa8f5d799cac..ebedc5574ca9c 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root, /* * See if the inner relation is provably unique for this outer rel. * - * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't - * matter since the executor can make the equivalent optimization anyway; - * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we - * must be considering a semijoin whose inner side is not provably unique - * (else reduce_unique_semijoins would've simplified it), so there's no - * point in calling innerrel_is_unique. However, if the LHS covers all of - * the semijoin's min_lefthand, then it's appropriate to set inner_unique + * We have some special cases: for JOIN_SEMI, it doesn't matter since the + * executor can make the equivalent optimization anyway. It also doesn't + * help enable use of Memoize, since a semijoin with a provably unique + * inner side should have been reduced to an inner join in that case. + * Therefore, we need not expend planner cycles on proofs. (For + * JOIN_ANTI, although it doesn't help the executor for the same reason, + * it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be + * considering a semijoin whose inner side is not provably unique (else + * reduce_unique_semijoins would've simplified it), so there's no point in + * calling innerrel_is_unique. However, if the LHS covers all of the + * semijoin's min_lefthand, then it's appropriate to set inner_unique * because the path produced by create_unique_path will be unique relative * to the LHS. (If we have an LHS that's only part of the min_lefthand, * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid @@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root, switch (jointype) { case JOIN_SEMI: - case JOIN_ANTI: - - /* - * XXX it may be worth proving this to allow a Memoize to be - * considered for Nested Loop Semi/Anti Joins. - */ extra.inner_unique = false; /* well, unproven */ break; case JOIN_UNIQUE_INNER: @@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel, return NULL; /* - * Currently we don't do this for SEMI and ANTI joins unless they're - * marked as inner_unique. This is because nested loop SEMI/ANTI joins - * don't scan the inner node to completion, which will mean memoize cannot - * mark the cache entry as complete. - * - * XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique - * = true. Should we? See add_paths_to_joinrel() + * Currently we don't do this for SEMI and ANTI joins, because nested loop + * SEMI/ANTI joins don't scan the inner node to completion, which means + * memoize cannot mark the cache entry as complete. Nor can we mark the + * cache entry as complete after fetching the first inner tuple, because + * if that tuple and the current outer tuple don't satisfy the join + * clauses, a second inner tuple that satisfies the parameters would find + * the cache entry already marked as complete. The only exception is when + * the inner relation is provably unique, as in that case, there won't be + * a second matching tuple and we can safely mark the cache entry as + * complete after fetching the first inner tuple. Note that in such + * cases, the SEMI join should have been reduced to an inner join by + * reduce_unique_semijoins. */ - if (!extra->inner_unique && (jointype == JOIN_SEMI || - jointype == JOIN_ANTI)) + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && + !extra->inner_unique) return NULL; /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 0b61aef962c6d..bfefc7dbea106 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -284,7 +284,10 @@ static Material *make_material(Plan *lefttree); static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids); + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, + Cardinality est_unique_keys, + double est_hit_ratio); static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, @@ -1318,6 +1321,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* * Compute sort column info, and adjust subplan's tlist as needed. @@ -1353,14 +1357,38 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and + * there are presorted keys; otherwise we use full sort. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } } @@ -1491,6 +1519,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* Build the child plan */ /* Must insist that all children return the same tlist */ @@ -1525,14 +1554,38 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and there + * are presorted keys; otherwise we use full sort. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } subplans = lappend(subplans, subplan); @@ -1703,7 +1756,8 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) plan = make_memoize(subplan, operators, collations, param_exprs, best_path->singlerow, best_path->binary_mode, - best_path->est_entries, keyparamids); + best_path->est_entries, keyparamids, best_path->est_calls, + best_path->est_unique_keys, best_path->est_hit_ratio); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6699,7 +6753,9 @@ materialize_finished_plan(Plan *subplan) static Memoize * make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids) + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, Cardinality est_unique_keys, + double est_hit_ratio) { Memoize *node = makeNode(Memoize); Plan *plan = &node->plan; @@ -6717,6 +6773,9 @@ make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, node->binary_mode = binary_mode; node->est_entries = est_entries; node->keyparamids = keyparamids; + node->est_calls = est_calls; + node->est_unique_keys = est_unique_keys; + node->est_hit_ratio = est_hit_ratio; return node; } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3ba..3e3fec8925291 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -3048,36 +3048,16 @@ add_base_clause_to_rel(PlannerInfo *root, Index relid, * expr_is_nonnullable * Check to see if the Expr cannot be NULL * - * If the Expr is a simple Var that is defined NOT NULL and meanwhile is not - * nulled by any outer joins, then we can know that it cannot be NULL. + * Currently we only support simple Vars. */ static bool expr_is_nonnullable(PlannerInfo *root, Expr *expr) { - RelOptInfo *rel; - Var *var; - /* For now only check simple Vars */ if (!IsA(expr, Var)) return false; - var = (Var *) expr; - - /* could the Var be nulled by any outer joins? */ - if (!bms_is_empty(var->varnullingrels)) - return false; - - /* system columns cannot be NULL */ - if (var->varattno < 0) - return true; - - /* is the column defined NOT NULL? */ - rel = find_base_rel(root, var->varno); - if (var->varattno > 0 && - bms_is_member(var->varattno, rel->notnullattnums)) - return true; - - return false; + return var_is_nonnullable(root, (Var *) expr, true); } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 549aedcfa991a..d59d6e4c6a021 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -342,6 +342,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->transientPlan = false; glob->dependsOnRole = false; glob->partition_directory = NULL; + glob->rel_notnullatts_hash = NULL; /* * Assess whether it's feasible to use parallel mode for this query. We @@ -557,6 +558,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->commandType = parse->commandType; result->queryId = parse->queryId; + result->planOrigin = PLAN_STMT_STANDARD; result->hasReturning = (parse->returningList != NIL); result->hasModifyingCTE = parse->hasModifyingCTE; result->canSetTag = parse->canSetTag; @@ -720,6 +722,18 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, */ transform_MERGE_to_join(parse); + /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. Note that this step does not descend into sublinks and + * subqueries; if we pull up any sublinks or subqueries below, their + * relation RTEs are processed just before pulling them up. + */ + parse = root->parse = preprocess_relation_rtes(root); + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. @@ -743,14 +757,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, */ preprocess_function_rtes(root); - /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. Recursion issues here are handled in the - * same way as for SubLinks. - */ - parse = root->parse = expand_virtual_generated_columns(root); - /* * Check to see if any subqueries in the jointree can be merged into this * query. @@ -787,23 +793,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, switch (rte->rtekind) { - case RTE_RELATION: - if (rte->inh) - { - /* - * Check to see if the relation actually has any children; - * if not, clear the inh flag so we can treat it as a - * plain base relation. - * - * Note: this could give a false-positive result, if the - * rel once had children but no longer does. We used to - * be able to clear rte->inh later on when we discovered - * that, but no more; we have to handle such cases as - * full-fledged inheritance. - */ - rte->inh = has_subclass(rte->relid); - } - break; case RTE_JOIN: root->hasJoinRTEs = true; if (IS_OUTER_JOIN(rte->jointype)) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e7cb3fede6658..d71ed958e31b3 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1454,6 +1454,7 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, Query *parse = root->parse; Query *subselect = (Query *) sublink->subselect; Node *whereClause; + PlannerInfo subroot; int rtoffset; int varno; Relids clause_varnos; @@ -1515,6 +1516,35 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, if (contain_volatile_functions(whereClause)) return NULL; + /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + * + * Note: we construct up an entirely dummy PlannerInfo for use here. This + * is fine because only the "glob" and "parse" links will be used in this + * case. + * + * Note: we temporarily assign back the WHERE clause so that any virtual + * generated column references within it can be expanded. It should be + * separated out again afterward. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + subselect->jointree->quals = whereClause; + subselect = preprocess_relation_rtes(&subroot); + + /* + * Now separate out the WHERE clause again. + */ + whereClause = subselect->jointree->quals; + subselect->jointree->quals = NULL; + /* * The subquery must have a nonempty jointree, but we can make it so. */ @@ -1732,6 +1762,7 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds) { Node *whereClause; + PlannerInfo subroot; List *leftargs, *rightargs, *opids, @@ -1791,12 +1822,15 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, * parent aliases were flattened already, and we're not going to pull any * child Vars (of any description) into the parent. * - * Note: passing the parent's root to eval_const_expressions is - * technically wrong, but we can get away with it since only the - * boundParams (if any) are used, and those would be the same in a - * subroot. - */ - whereClause = eval_const_expressions(root, whereClause); + * Note: we construct up an entirely dummy PlannerInfo to pass to + * eval_const_expressions. This is fine because only the "glob" and + * "parse" links are used by eval_const_expressions. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + whereClause = eval_const_expressions(&subroot, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); whereClause = (Node *) make_ands_implicit((Expr *) whereClause); diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 87dc6f56b576f..35e8d3c183b47 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -4,10 +4,10 @@ * Planner preprocessing for subqueries and join tree manipulation. * * NOTE: the intended sequence for invoking these operations is + * preprocess_relation_rtes * replace_empty_jointree * pull_up_sublinks * preprocess_function_rtes - * expand_virtual_generated_columns * pull_up_subqueries * flatten_simple_union_all * do expression preprocessing (including flattening JOIN alias vars) @@ -36,6 +36,7 @@ #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "optimizer/placeholder.h" +#include "optimizer/plancat.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" @@ -102,6 +103,9 @@ typedef struct reduce_outer_joins_partial_state Relids unreduced_side; /* relids in its still-nullable side */ } reduce_outer_joins_partial_state; +static Query *expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation); static Node *pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, Relids *relids); static Node *pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, @@ -392,6 +396,181 @@ transform_MERGE_to_join(Query *parse) parse->mergeJoinCondition = NULL; /* join condition not needed */ } +/* + * preprocess_relation_rtes + * Do the preprocessing work for any relation RTEs in the FROM clause. + * + * This scans the rangetable for relation RTEs and retrieves the necessary + * catalog information for each relation. Using this information, it clears + * the inh flag for any relation that has no children, collects not-null + * attribute numbers for any relation that has column not-null constraints, and + * expands virtual generated columns for any relation that contains them. + * + * Note that expanding virtual generated columns may cause the query tree to + * have new copies of rangetable entries. Therefore, we have to use list_nth + * instead of foreach when iterating over the query's rangetable. + * + * Returns a modified copy of the query tree, if any relations with virtual + * generated columns are present. + */ +Query * +preprocess_relation_rtes(PlannerInfo *root) +{ + Query *parse = root->parse; + int rtable_size; + int rt_index; + + rtable_size = list_length(parse->rtable); + + for (rt_index = 0; rt_index < rtable_size; rt_index++) + { + RangeTblEntry *rte = rt_fetch(rt_index + 1, parse->rtable); + Relation relation; + + /* We only care about relation RTEs. */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We need not lock the relation since it was already locked by the + * rewriter. + */ + relation = table_open(rte->relid, NoLock); + + /* + * Check to see if the relation actually has any children; if not, + * clear the inh flag so we can treat it as a plain base relation. + * + * Note: this could give a false-positive result, if the rel once had + * children but no longer does. We used to be able to clear rte->inh + * later on when we discovered that, but no more; we have to handle + * such cases as full-fledged inheritance. + */ + if (rte->inh) + rte->inh = relation->rd_rel->relhassubclass; + + /* + * Check to see if the relation has any column not-null constraints; + * if so, retrieve the constraint information and store it in a + * relation OID based hash table. + */ + get_relation_notnullatts(root, relation); + + /* + * Check to see if the relation has any virtual generated columns; if + * so, replace all Var nodes in the query that reference these columns + * with the generation expressions. + */ + parse = expand_virtual_generated_columns(root, parse, + rte, rt_index + 1, + relation); + + table_close(relation, NoLock); + } + + return parse; +} + +/* + * expand_virtual_generated_columns + * Expand virtual generated columns for the given relation. + * + * This checks whether the given relation has any virtual generated columns, + * and if so, replaces all Var nodes in the query that reference those columns + * with their generation expressions. + * + * Returns a modified copy of the query tree if the relation contains virtual + * generated columns. + */ +static Query * +expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation) +{ + TupleDesc tupdesc; + + /* Only normal relations can have virtual generated columns */ + Assert(rte->rtekind == RTE_RELATION); + + tupdesc = RelationGetDescr(relation); + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + List *tlist = NIL; + pullup_replace_vars_context rvcontext; + + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + TargetEntry *tle; + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + Node *defexpr; + + defexpr = build_generation_expression(relation, i + 1); + ChangeVarNodes(defexpr, 1, rt_index, 0); + + tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + else + { + Var *var; + + var = makeVar(rt_index, + i + 1, + attr->atttypid, + attr->atttypmod, + attr->attcollation, + 0); + + tle = makeTargetEntry((Expr *) var, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + } + + Assert(list_length(tlist) > 0); + Assert(!rte->lateral); + + /* + * The relation's targetlist items are now in the appropriate form to + * insert into the query, except that we may need to wrap them in + * PlaceHolderVars. Set up required context data for + * pullup_replace_vars. + */ + rvcontext.root = root; + rvcontext.targetlist = tlist; + rvcontext.target_rte = rte; + rvcontext.result_relation = parse->resultRelation; + /* won't need these values */ + rvcontext.relids = NULL; + rvcontext.nullinfo = NULL; + /* pass NULL for outer_hasSubLinks */ + rvcontext.outer_hasSubLinks = NULL; + rvcontext.varno = rt_index; + /* this flag will be set below, if needed */ + rvcontext.wrap_option = REPLACE_WRAP_NONE; + /* initialize cache array with indexes 0 .. length(tlist) */ + rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * + sizeof(Node *)); + + /* + * If the query uses grouping sets, we need a PlaceHolderVar for each + * expression of the relation's targetlist items. (See comments in + * pull_up_simple_subquery().) + */ + if (parse->groupingSets) + rvcontext.wrap_option = REPLACE_WRAP_ALL; + + /* + * Apply pullup variable replacement throughout the query tree. + */ + parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); + } + + return parse; +} + /* * replace_empty_jointree * If the Query's jointree is empty, replace it with a dummy RTE_RESULT @@ -949,128 +1128,6 @@ preprocess_function_rtes(PlannerInfo *root) } } -/* - * expand_virtual_generated_columns - * Expand all virtual generated column references in a query. - * - * This scans the rangetable for relations with virtual generated columns, and - * replaces all Var nodes in the query that reference these columns with the - * generation expressions. Note that we do not descend into subqueries; that - * is taken care of when the subqueries are planned. - * - * This has to be done after we have pulled up any SubLinks within the query's - * quals; otherwise any virtual generated column references within the SubLinks - * that should be transformed into joins wouldn't get expanded. - * - * Returns a modified copy of the query tree, if any relations with virtual - * generated columns are present. - */ -Query * -expand_virtual_generated_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - int rt_index; - ListCell *lc; - - rt_index = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - Relation rel; - TupleDesc tupdesc; - - ++rt_index; - - /* - * Only normal relations can have virtual generated columns. - */ - if (rte->rtekind != RTE_RELATION) - continue; - - rel = table_open(rte->relid, NoLock); - - tupdesc = RelationGetDescr(rel); - if (tupdesc->constr && tupdesc->constr->has_generated_virtual) - { - List *tlist = NIL; - pullup_replace_vars_context rvcontext; - - for (int i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i); - TargetEntry *tle; - - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - { - Node *defexpr; - - defexpr = build_generation_expression(rel, i + 1); - ChangeVarNodes(defexpr, 1, rt_index, 0); - - tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - else - { - Var *var; - - var = makeVar(rt_index, - i + 1, - attr->atttypid, - attr->atttypmod, - attr->attcollation, - 0); - - tle = makeTargetEntry((Expr *) var, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - } - - Assert(list_length(tlist) > 0); - Assert(!rte->lateral); - - /* - * The relation's targetlist items are now in the appropriate form - * to insert into the query, except that we may need to wrap them - * in PlaceHolderVars. Set up required context data for - * pullup_replace_vars. - */ - rvcontext.root = root; - rvcontext.targetlist = tlist; - rvcontext.target_rte = rte; - rvcontext.result_relation = parse->resultRelation; - /* won't need these values */ - rvcontext.relids = NULL; - rvcontext.nullinfo = NULL; - /* pass NULL for outer_hasSubLinks */ - rvcontext.outer_hasSubLinks = NULL; - rvcontext.varno = rt_index; - /* this flag will be set below, if needed */ - rvcontext.wrap_option = REPLACE_WRAP_NONE; - /* initialize cache array with indexes 0 .. length(tlist) */ - rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * - sizeof(Node *)); - - /* - * If the query uses grouping sets, we need a PlaceHolderVar for - * each expression of the relation's targetlist items. (See - * comments in pull_up_simple_subquery().) - */ - if (parse->groupingSets) - rvcontext.wrap_option = REPLACE_WRAP_ALL; - - /* - * Apply pullup variable replacement throughout the query tree. - */ - parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); - } - - table_close(rel, NoLock); - } - - return parse; -} - /* * pull_up_subqueries * Look for subqueries in the rangetable that can be pulled up into @@ -1333,6 +1390,16 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, /* No CTEs to worry about */ Assert(subquery->cteList == NIL); + /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + */ + subquery = subroot->parse = preprocess_relation_rtes(subroot); + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. @@ -1352,13 +1419,6 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, */ preprocess_function_rtes(subroot); - /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. - */ - subquery = subroot->parse = expand_virtual_generated_columns(subroot); - /* * Recursively pull up the subquery's subqueries, so that * pull_up_subqueries' processing is complete for its jointree and diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 26a3e0500866c..6f0b338d2cdf1 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_class.h" #include "catalog/pg_language.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -36,6 +37,7 @@ #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "parser/analyze.h" @@ -43,6 +45,7 @@ #include "parser/parse_collate.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "parser/parsetree.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "tcop/tcopprot.h" @@ -2242,7 +2245,8 @@ rowtype_field_matches(Oid rowtypeid, int fieldnum, * only operators and functions that are reasonable to try to execute. * * NOTE: "root" can be passed as NULL if the caller never wants to do any - * Param substitutions nor receive info about inlined functions. + * Param substitutions nor receive info about inlined functions nor reduce + * NullTest for Vars to constant true or constant false. * * NOTE: the planner assumes that this will always flatten nested AND and * OR clauses into N-argument form. See comments in prepqual.c. @@ -3333,6 +3337,13 @@ eval_const_expressions_mutator(Node *node, -1, coalesceexpr->coalescecollid); + /* + * If there's exactly one surviving argument, we no longer + * need COALESCE at all: the result is that argument + */ + if (list_length(newargs) == 1) + return (Node *) linitial(newargs); + newcoalesce = makeNode(CoalesceExpr); newcoalesce->coalescetype = coalesceexpr->coalescetype; newcoalesce->coalescecollid = coalesceexpr->coalescecollid; @@ -3537,6 +3548,31 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (!ntest->argisrow && arg && IsA(arg, Var) && context->root) + { + Var *varg = (Var *) arg; + bool result; + + if (var_is_nonnullable(context->root, varg, false)) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + result = false; + break; + case IS_NOT_NULL: + result = true; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + result = false; /* keep compiler quiet */ + break; + } + + return makeBoolConst(result, false); + } + } newntest = makeNode(NullTest); newntest->arg = (Expr *) arg; @@ -4155,6 +4191,67 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, return newexpr; } +/* + * var_is_nonnullable: check to see if the Var cannot be NULL + * + * If the Var is defined NOT NULL and meanwhile is not nulled by any outer + * joins or grouping sets, then we can know that it cannot be NULL. + * + * use_rel_info indicates whether the corresponding RelOptInfo is available for + * use. + */ +bool +var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) +{ + Relids notnullattnums = NULL; + + Assert(IsA(var, Var)); + + /* skip upper-level Vars */ + if (var->varlevelsup != 0) + return false; + + /* could the Var be nulled by any outer joins or grouping sets? */ + if (!bms_is_empty(var->varnullingrels)) + return false; + + /* system columns cannot be NULL */ + if (var->varattno < 0) + return true; + + /* + * Check if the Var is defined as NOT NULL. We retrieve the column NOT + * NULL constraint information from the corresponding RelOptInfo if it is + * available; otherwise, we search the hash table for this information. + */ + if (use_rel_info) + { + RelOptInfo *rel = find_base_rel(root, var->varno); + + notnullattnums = rel->notnullattnums; + } + else + { + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + notnullattnums = find_relation_notnullatts(root, rte->relid); + } + + if (var->varattno > 0 && + bms_is_member(var->varattno, notnullattnums)) + return true; + + return false; +} + /* * expand_function_arguments: convert named-notation args to positional args * and/or insert default args, as needed diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index 17e51cd75d744..30d158069e332 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -466,8 +466,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index *childRTindex_p) { Query *parse = root->parse; - Oid parentOID PG_USED_FOR_ASSERTS_ONLY = - RelationGetRelid(parentrel); + Oid parentOID = RelationGetRelid(parentrel); Oid childOID = RelationGetRelid(childrel); RangeTblEntry *childrte; Index childRTindex; @@ -513,6 +512,13 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, *childrte_p = childrte; *childRTindex_p = childRTindex; + /* + * Retrieve column not-null constraint information for the child relation + * if its relation OID is different from the parent's. + */ + if (childOID != parentOID) + get_relation_notnullatts(root, childrel); + /* * Build an AppendRelInfo struct for each parent/child pair. */ diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e0192d4a491d2..a4c5867cdcb84 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,12 +1404,12 @@ create_append_path(PlannerInfo *root, pathnode->path.total_cost = child->total_cost; } else - cost_append(pathnode); + cost_append(pathnode, root); /* Must do this last, else cost_append complains */ pathnode->path.pathkeys = child->pathkeys; } else - cost_append(pathnode); + cost_append(pathnode, root); /* If the caller provided a row estimate, override the computed value. */ if (rows >= 0) @@ -1515,6 +1515,9 @@ create_merge_append_path(PlannerInfo *root, foreach(l, subpaths) { Path *subpath = (Path *) lfirst(l); + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ /* All child paths should be unparameterized */ Assert(bms_is_empty(PATH_REQ_OUTER(subpath))); @@ -1523,32 +1526,52 @@ create_merge_append_path(PlannerInfo *root, pathnode->path.parallel_safe = pathnode->path.parallel_safe && subpath->parallel_safe; - if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - /* Subpath is adequately ordered, we won't need to sort it */ - input_disabled_nodes += subpath->disabled_nodes; - input_startup_cost += subpath->startup_cost; - input_total_cost += subpath->total_cost; - } - else - { - /* We'll need to insert a Sort node, so include cost for that */ - Path sort_path; /* dummy for result of cost_sort */ + /* + * We'll need to insert a Sort node, so include costs for that. We + * choose to use incremental sort if it is enabled and there are + * presorted keys; otherwise we use full sort. + * + * We can use the parent's LIMIT if any, since we certainly won't + * pull more than that many tuples from any child. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } - cost_sort(&sort_path, - root, - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - pathnode->limit_tuples); - input_disabled_nodes += sort_path.disabled_nodes; - input_startup_cost += sort_path.startup_cost; - input_total_cost += sort_path.total_cost; + subpath = &sort_path; } + + input_disabled_nodes += subpath->disabled_nodes; + input_startup_cost += subpath->startup_cost; + input_total_cost += subpath->total_cost; } /* @@ -1666,7 +1689,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) MemoizePath * create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, List *param_exprs, List *hash_operators, - bool singlerow, bool binary_mode, double calls) + bool singlerow, bool binary_mode, Cardinality est_calls) { MemoizePath *pathnode = makeNode(MemoizePath); @@ -1687,7 +1710,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->param_exprs = param_exprs; pathnode->singlerow = singlerow; pathnode->binary_mode = binary_mode; - pathnode->calls = clamp_row_est(calls); /* * For now we set est_entries to 0. cost_memoize_rescan() does all the @@ -1697,6 +1719,12 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ pathnode->est_entries = 0; + pathnode->est_calls = clamp_row_est(est_calls); + + /* These will also be set later in cost_memoize_rescan() */ + pathnode->est_unique_keys = 0.0; + pathnode->est_hit_ratio = 0.0; + /* we should not generate this path type when enable_memoize=false */ Assert(enable_memoize); pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -4236,7 +4264,7 @@ reparameterize_path(PlannerInfo *root, Path *path, mpath->hash_operators, mpath->singlerow, mpath->binary_mode, - mpath->calls); + mpath->est_calls); } default: break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b647302d..c6a58afc5e506 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -59,6 +59,12 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +typedef struct NotnullHashEntry +{ + Oid relid; /* OID of the relation */ + Relids notnullattnums; /* attnums of NOT NULL columns */ +} NotnullHashEntry; + static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, Relation relation, bool inhparent); @@ -172,27 +178,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * RangeTblEntry does get populated. */ if (!inhparent || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - for (int i = 0; i < relation->rd_att->natts; i++) - { - CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); - - Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); - - if (attr->attnullability == ATTNULLABLE_VALID) - { - rel->notnullattnums = bms_add_member(rel->notnullattnums, - i + 1); - - /* - * Per RemoveAttributeById(), dropped columns will have their - * attnotnull unset, so we needn't check for dropped columns - * in the above condition. - */ - Assert(!attr->attisdropped); - } - } - } + rel->notnullattnums = find_relation_notnullatts(root, relationObjectId); /* * Estimate relation size --- unless it's an inheritance parent, in which @@ -683,6 +669,105 @@ get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, } } +/* + * get_relation_notnullatts - + * Retrieves column not-null constraint information for a given relation. + * + * We do this while we have the relcache entry open, and store the column + * not-null constraint information in a hash table based on the relation OID. + */ +void +get_relation_notnullatts(PlannerInfo *root, Relation relation) +{ + Oid relid = RelationGetRelid(relation); + NotnullHashEntry *hentry; + bool found; + Relids notnullattnums = NULL; + + /* bail out if the relation has no not-null constraints */ + if (relation->rd_att->constr == NULL || + !relation->rd_att->constr->has_not_null) + return; + + /* create the hash table if it hasn't been created yet */ + if (root->glob->rel_notnullatts_hash == NULL) + { + HTAB *hashtab; + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(NotnullHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + hashtab = hash_create("Relation NOT NULL attnums", + 64L, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + root->glob->rel_notnullatts_hash = hashtab; + } + + /* + * Create a hash entry for this relation OID, if we don't have one + * already. + */ + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_ENTER, + &found); + + /* bail out if a hash entry already exists for this relation OID */ + if (found) + return; + + /* collect the column not-null constraint information for this relation */ + for (int i = 0; i < relation->rd_att->natts; i++) + { + CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); + + Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); + + if (attr->attnullability == ATTNULLABLE_VALID) + { + notnullattnums = bms_add_member(notnullattnums, i + 1); + + /* + * Per RemoveAttributeById(), dropped columns will have their + * attnotnull unset, so we needn't check for dropped columns in + * the above condition. + */ + Assert(!attr->attisdropped); + } + } + + /* ... and initialize the new hash entry */ + hentry->notnullattnums = notnullattnums; +} + +/* + * find_relation_notnullatts - + * Searches the hash table and returns the column not-null constraint + * information for a given relation. + */ +Relids +find_relation_notnullatts(PlannerInfo *root, Oid relid) +{ + NotnullHashEntry *hentry; + bool found; + + if (root->glob->rel_notnullatts_hash == NULL) + return NULL; + + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_FIND, + &found); + if (!found) + return NULL; + + return hentry->notnullattnums; +} + /* * infer_arbiter_indexes - * Determine the unique indexes used to arbitrate speculative insertion. diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 50f53159d5819..db43034b9db57 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -318,6 +318,11 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type opt_qualified_name %type opt_concurrently %type opt_drop_behavior +%type opt_utility_option_list +%type utility_option_list +%type utility_option_elem +%type utility_option_name +%type utility_option_arg %type alter_column_default opclass_item opclass_drop alter_using %type add_drop opt_asc_desc opt_nulls_order @@ -338,10 +343,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); create_extension_opt_item alter_extension_opt_item %type opt_lock lock_type cast_context -%type utility_option_name -%type utility_option_elem -%type utility_option_list -%type utility_option_arg %type drop_option %type opt_or_replace opt_no opt_grant_grant_option @@ -556,7 +557,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type generic_option_list alter_generic_option_list %type reindex_target_relation reindex_target_all -%type opt_reindex_option_list %type copy_generic_opt_arg copy_generic_opt_arg_list_item %type copy_generic_opt_elem @@ -1141,6 +1141,41 @@ opt_drop_behavior: | /* EMPTY */ { $$ = DROP_RESTRICT; /* default */ } ; +opt_utility_option_list: + '(' utility_option_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NULL; } + ; + +utility_option_list: + utility_option_elem + { + $$ = list_make1($1); + } + | utility_option_list ',' utility_option_elem + { + $$ = lappend($1, $3); + } + ; + +utility_option_elem: + utility_option_name utility_option_arg + { + $$ = makeDefElem($1, $2, @1); + } + ; + +utility_option_name: + NonReservedWord { $$ = $1; } + | analyze_keyword { $$ = "analyze"; } + | FORMAT_LA { $$ = "format"; } + ; + +utility_option_arg: + opt_boolean_or_string { $$ = (Node *) makeString($1); } + | NumericOnly { $$ = (Node *) $1; } + | /* EMPTY */ { $$ = NULL; } + ; + /***************************************************************************** * * CALL statement @@ -2028,11 +2063,12 @@ constraints_set_mode: * Checkpoint statement */ CheckPointStmt: - CHECKPOINT + CHECKPOINT opt_utility_option_list { CheckPointStmt *n = makeNode(CheckPointStmt); $$ = (Node *) n; + n->options = $2; } ; @@ -2668,6 +2704,12 @@ alter_table_cmd: c->alterDeferrability = true; if ($4 & CAS_NO_INHERIT) c->alterInheritability = true; + /* handle unsupported case with specific error message */ + if ($4 & CAS_NOT_VALID) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraints cannot be altered to be NOT VALID"), + parser_errposition(@4)); processCASbits($4, @4, "FOREIGN KEY", &c->deferrable, &c->initdeferred, @@ -6035,6 +6077,26 @@ CreateTrigStmt: EXECUTE FUNCTION_or_PROCEDURE func_name '(' TriggerFuncArgs ')' { CreateTrigStmt *n = makeNode(CreateTrigStmt); + bool dummy; + + if (($11 & CAS_NOT_VALID) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT VALID"), + parser_errposition(@11)); + if (($11 & CAS_NO_INHERIT) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NO INHERIT"), + parser_errposition(@11)); + if (($11 & CAS_NOT_ENFORCED) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT ENFORCED"), + parser_errposition(@11)); n->replace = $2; if (n->replace) /* not supported, see CreateTrigger */ @@ -6054,7 +6116,7 @@ CreateTrigStmt: n->whenClause = $15; n->transitionRels = NIL; processCASbits($11, @11, "TRIGGER", - &n->deferrable, &n->initdeferred, NULL, + &n->deferrable, &n->initdeferred, &dummy, NULL, NULL, yyscanner); n->constrrel = $10; $$ = (Node *) n; @@ -7477,6 +7539,8 @@ fetch_args: cursor_name n->portalname = $1; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | from_in cursor_name @@ -7486,6 +7550,19 @@ fetch_args: cursor_name n->portalname = $2; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; + $$ = (Node *) n; + } + | SignedIconst opt_from_in cursor_name + { + FetchStmt *n = makeNode(FetchStmt); + + n->portalname = $3; + n->direction = FETCH_FORWARD; + n->howMany = $1; + n->location = @1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | NEXT opt_from_in cursor_name @@ -7495,6 +7572,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NEXT; $$ = (Node *) n; } | PRIOR opt_from_in cursor_name @@ -7504,6 +7583,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_PRIOR; $$ = (Node *) n; } | FIRST_P opt_from_in cursor_name @@ -7513,6 +7594,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FIRST; $$ = (Node *) n; } | LAST_P opt_from_in cursor_name @@ -7522,6 +7605,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = -1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_LAST; $$ = (Node *) n; } | ABSOLUTE_P SignedIconst opt_from_in cursor_name @@ -7531,6 +7616,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_ABSOLUTE; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_ABSOLUTE; $$ = (Node *) n; } | RELATIVE_P SignedIconst opt_from_in cursor_name @@ -7540,15 +7627,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_RELATIVE; n->howMany = $2; - $$ = (Node *) n; - } - | SignedIconst opt_from_in cursor_name - { - FetchStmt *n = makeNode(FetchStmt); - - n->portalname = $3; - n->direction = FETCH_FORWARD; - n->howMany = $1; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_RELATIVE; $$ = (Node *) n; } | ALL opt_from_in cursor_name @@ -7558,6 +7638,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_ALL; $$ = (Node *) n; } | FORWARD opt_from_in cursor_name @@ -7567,6 +7649,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD SignedIconst opt_from_in cursor_name @@ -7576,6 +7660,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD ALL opt_from_in cursor_name @@ -7585,6 +7671,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD_ALL; $$ = (Node *) n; } | BACKWARD opt_from_in cursor_name @@ -7594,6 +7682,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD SignedIconst opt_from_in cursor_name @@ -7603,6 +7693,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD ALL opt_from_in cursor_name @@ -7612,6 +7704,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD_ALL; $$ = (Node *) n; } ; @@ -9289,7 +9383,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d *****************************************************************************/ ReindexStmt: - REINDEX opt_reindex_option_list reindex_target_relation opt_concurrently qualified_name + REINDEX opt_utility_option_list reindex_target_relation opt_concurrently qualified_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9302,7 +9396,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list SCHEMA opt_concurrently name + | REINDEX opt_utility_option_list SCHEMA opt_concurrently name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9315,7 +9409,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list reindex_target_all opt_concurrently opt_single_name + | REINDEX opt_utility_option_list reindex_target_all opt_concurrently opt_single_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9337,10 +9431,6 @@ reindex_target_all: SYSTEM_P { $$ = REINDEX_OBJECT_SYSTEM; } | DATABASE { $$ = REINDEX_OBJECT_DATABASE; } ; -opt_reindex_option_list: - '(' utility_option_list ')' { $$ = $2; } - | /* EMPTY */ { $$ = NULL; } - ; /***************************************************************************** * @@ -11627,7 +11717,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'T'; + n->subtype = AD_AlterDefault; n->typeName = $3; n->def = $4; $$ = (Node *) n; @@ -11637,7 +11727,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'N'; + n->subtype = AD_DropNotNull; n->typeName = $3; $$ = (Node *) n; } @@ -11646,7 +11736,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'O'; + n->subtype = AD_SetNotNull; n->typeName = $3; $$ = (Node *) n; } @@ -11655,7 +11745,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'C'; + n->subtype = AD_AddConstraint; n->typeName = $3; n->def = $5; $$ = (Node *) n; @@ -11665,7 +11755,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'X'; + n->subtype = AD_DropConstraint; n->typeName = $3; n->name = $6; n->behavior = $7; @@ -11677,7 +11767,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'X'; + n->subtype = AD_DropConstraint; n->typeName = $3; n->name = $8; n->behavior = $9; @@ -11689,7 +11779,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'V'; + n->subtype = AD_ValidateConstraint; n->typeName = $3; n->name = $6; $$ = (Node *) n; @@ -11838,13 +11928,13 @@ ClusterStmt: n->params = $3; $$ = (Node *) n; } - | CLUSTER '(' utility_option_list ')' + | CLUSTER opt_utility_option_list { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = $3; + n->params = $2; $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-14 compatibility */ @@ -11854,21 +11944,18 @@ ClusterStmt: n->relation = $3; n->indexname = $4; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-17 compatibility */ - | CLUSTER opt_verbose + | CLUSTER VERBOSE { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = NIL; - if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ @@ -11878,9 +11965,8 @@ ClusterStmt: n->relation = $5; n->indexname = $3; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } ; @@ -11931,64 +12017,31 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose opt_analyze opt_vacuum_relati } ; -AnalyzeStmt: analyze_keyword opt_verbose opt_vacuum_relation_list +AnalyzeStmt: analyze_keyword opt_utility_option_list opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = NIL; - if ($2) - n->options = lappend(n->options, - makeDefElem("verbose", NULL, @2)); + n->options = $2; n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } - | analyze_keyword '(' utility_option_list ')' opt_vacuum_relation_list + | analyze_keyword VERBOSE opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = $3; - n->rels = $5; + n->options = list_make1(makeDefElem("verbose", NULL, @2)); + n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } ; -utility_option_list: - utility_option_elem - { - $$ = list_make1($1); - } - | utility_option_list ',' utility_option_elem - { - $$ = lappend($1, $3); - } - ; - analyze_keyword: ANALYZE | ANALYSE /* British */ ; -utility_option_elem: - utility_option_name utility_option_arg - { - $$ = makeDefElem($1, $2, @1); - } - ; - -utility_option_name: - NonReservedWord { $$ = $1; } - | analyze_keyword { $$ = "analyze"; } - | FORMAT_LA { $$ = "format"; } - ; - -utility_option_arg: - opt_boolean_or_string { $$ = (Node *) makeString($1); } - | NumericOnly { $$ = (Node *) $1; } - | /* EMPTY */ { $$ = NULL; } - ; - opt_analyze: analyze_keyword { $$ = true; } | /*EMPTY*/ { $$ = false; } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 451fb90a610a7..9474095f271a1 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -3190,7 +3190,7 @@ autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) rel_list = list_make1(rel); MemoryContextSwitchTo(old_context); - vacuum(rel_list, &tab->at_params, bstrategy, vac_context, true); + vacuum(rel_list, tab->at_params, bstrategy, vac_context, true); MemoryContextDelete(vac_context); } diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 116ddf7b835f1..1ad65c237c34e 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -613,6 +613,7 @@ ResetBackgroundWorkerCrashTimes(void) * resetting. */ rw->rw_crashed_at = 0; + rw->rw_pid = 0; /* * If there was anyone waiting for it, they're history. diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index fda91ffd1ce2d..8490148a47d52 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -42,6 +42,8 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogrecovery.h" +#include "catalog/pg_authid.h" +#include "commands/defrem.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -61,6 +63,7 @@ #include "storage/shmem.h" #include "storage/smgr.h" #include "storage/spin.h" +#include "utils/acl.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" @@ -127,6 +130,13 @@ typedef struct int num_requests; /* current # of requests */ int max_requests; /* allocated array size */ + + int head; /* Index of the first request in the ring + * buffer */ + int tail; /* Index of the last request in the ring + * buffer */ + + /* The ring buffer of pending checkpointer requests */ CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; } CheckpointerShmemStruct; @@ -135,6 +145,12 @@ static CheckpointerShmemStruct *CheckpointerShmem; /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ #define WRITES_PER_ABSORB 1000 +/* Maximum number of checkpointer requests to process in one batch */ +#define CKPT_REQ_BATCH_SIZE 10000 + +/* Max number of requests the checkpointer request queue can hold */ +#define MAX_CHECKPOINT_REQUESTS 10000000 + /* * GUC parameters */ @@ -161,7 +177,7 @@ static pg_time_t last_xlog_switch_time; static void ProcessCheckpointerInterrupts(void); static void CheckArchiveTimeout(void); static bool IsCheckpointOnSchedule(double progress); -static bool ImmediateCheckpointRequested(void); +static bool FastCheckpointRequested(void); static bool CompactCheckpointerRequestQueue(void); static void UpdateSharedMemoryConfig(void); @@ -734,12 +750,12 @@ CheckArchiveTimeout(void) } /* - * Returns true if an immediate checkpoint request is pending. (Note that - * this does not check the *current* checkpoint's IMMEDIATE flag, but whether - * there is one pending behind it.) + * Returns true if a fast checkpoint request is pending. (Note that this does + * not check the *current* checkpoint's FAST flag, but whether there is one + * pending behind it.) */ static bool -ImmediateCheckpointRequested(void) +FastCheckpointRequested(void) { volatile CheckpointerShmemStruct *cps = CheckpointerShmem; @@ -747,7 +763,7 @@ ImmediateCheckpointRequested(void) * We don't need to acquire the ckpt_lck in this case because we're only * looking at a single flag bit. */ - if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) + if (cps->ckpt_flags & CHECKPOINT_FAST) return true; return false; } @@ -760,7 +776,7 @@ ImmediateCheckpointRequested(void) * checkpoint_completion_target. * * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * examined is CHECKPOINT_FAST, which disables delays between writes. * * 'progress' is an estimate of how much of the work has been done, as a * fraction between 0.0 meaning none, and 1.0 meaning all done. @@ -778,10 +794,10 @@ CheckpointWriteDelay(int flags, double progress) * Perform the usual duties and take a nap, unless we're behind schedule, * in which case we just try to catch up as quickly as possible. */ - if (!(flags & CHECKPOINT_IMMEDIATE) && + if (!(flags & CHECKPOINT_FAST) && !ShutdownXLOGPending && !ShutdownRequestPending && - !ImmediateCheckpointRequested() && + !FastCheckpointRequested() && IsCheckpointOnSchedule(progress)) { if (ConfigReloadPending) @@ -970,12 +986,63 @@ CheckpointerShmemInit(void) */ MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); - CheckpointerShmem->max_requests = NBuffers; + CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); + CheckpointerShmem->head = CheckpointerShmem->tail = 0; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); } } +/* + * ExecCheckpoint + * Primary entry point for manual CHECKPOINT commands + * + * This is mainly a wrapper for RequestCheckpoint(). + */ +void +ExecCheckpoint(ParseState *pstate, CheckPointStmt *stmt) +{ + bool fast = true; + bool unlogged = false; + + foreach_ptr(DefElem, opt, stmt->options) + { + if (strcmp(opt->defname, "mode") == 0) + { + char *mode = defGetString(opt); + + if (strcmp(mode, "spread") == 0) + fast = false; + else if (strcmp(mode, "fast") != 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized MODE option \"%s\"", mode), + parser_errposition(pstate, opt->location))); + } + else if (strcmp(opt->defname, "flush_unlogged") == 0) + unlogged = defGetBoolean(opt); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized CHECKPOINT option \"%s\"", opt->defname), + parser_errposition(pstate, opt->location))); + } + + if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + /* translator: %s is name of an SQL command (e.g., CHECKPOINT) */ + errmsg("permission denied to execute %s command", + "CHECKPOINT"), + errdetail("Only roles with privileges of the \"%s\" role may execute this command.", + "pg_checkpoint"))); + + RequestCheckpoint(CHECKPOINT_WAIT | + (fast ? CHECKPOINT_FAST : 0) | + (unlogged ? CHECKPOINT_FLUSH_UNLOGGED : 0) | + (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE)); +} + /* * RequestCheckpoint * Called in backend processes to request a checkpoint @@ -983,11 +1050,11 @@ CheckpointerShmemInit(void) * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * CHECKPOINT_FAST: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or - * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_END_OF_RECOVERY, and the CHECKPOINT command). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal checkpointer to do it, and return). * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. @@ -1009,7 +1076,7 @@ RequestCheckpoint(int flags) * There's no point in doing slow checkpoints in a standalone backend, * because there's no other backends the checkpoint could disrupt. */ - CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(flags | CHECKPOINT_FAST); /* Free all smgr objects, as CheckpointerMain() normally would. */ smgrdestroyall(); @@ -1148,6 +1215,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; + int insert_pos; if (!IsUnderPostmaster) return false; /* probably shouldn't even get here */ @@ -1171,10 +1239,14 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) } /* OK, insert request */ - request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + insert_pos = CheckpointerShmem->tail; + request = &CheckpointerShmem->requests[insert_pos]; request->ftag = *ftag; request->type = type; + CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; + CheckpointerShmem->num_requests++; + /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests / 2); @@ -1216,12 +1288,16 @@ CompactCheckpointerRequestQueue(void) struct CheckpointerSlotMapping { CheckpointerRequest request; - int slot; + int ring_idx; }; - int n, - preserve_count; + int n; int num_skipped = 0; + int head; + int max_requests; + int num_requests; + int read_idx, + write_idx; HASHCTL ctl; HTAB *htab; bool *skip_slot; @@ -1233,8 +1309,13 @@ CompactCheckpointerRequestQueue(void) if (CritSectionCount > 0) return false; + max_requests = CheckpointerShmem->max_requests; + num_requests = CheckpointerShmem->num_requests; + /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + skip_slot = palloc0(sizeof(bool) * max_requests); + + head = CheckpointerShmem->head; /* Initialize temporary hash table */ ctl.keysize = sizeof(CheckpointerRequest); @@ -1258,7 +1339,8 @@ CompactCheckpointerRequestQueue(void) * away preceding entries that would end up being canceled anyhow), but * it's not clear that the extra complexity would buy us anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = head; + for (n = 0; n < num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; @@ -1271,16 +1353,19 @@ CompactCheckpointerRequestQueue(void) * CheckpointerShmemInit. Note also that RelFileLocator had better * contain no pad bytes. */ - request = &CheckpointerShmem->requests[n]; + request = &CheckpointerShmem->requests[read_idx]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ - skip_slot[slotmap->slot] = true; + skip_slot[slotmap->ring_idx] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ - slotmap->slot = n; + slotmap->ring_idx = read_idx; + + /* Move to the next request in the ring buffer */ + read_idx = (read_idx + 1) % max_requests; } /* Done with the hash table. */ @@ -1294,17 +1379,34 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. */ - preserve_count = 0; - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = write_idx = head; + for (n = 0; n < num_requests; n++) { - if (skip_slot[n]) - continue; - CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + /* If this slot is NOT skipped, keep it */ + if (!skip_slot[read_idx]) + { + /* If the read and write positions are different, copy the request */ + if (write_idx != read_idx) + CheckpointerShmem->requests[write_idx] = + CheckpointerShmem->requests[read_idx]; + + /* Advance the write position */ + write_idx = (write_idx + 1) % max_requests; + } + + read_idx = (read_idx + 1) % max_requests; } + + /* + * Update ring buffer state: head remains the same, tail moves, count + * decreases + */ + CheckpointerShmem->tail = write_idx; + CheckpointerShmem->num_requests -= num_skipped; + ereport(DEBUG1, (errmsg_internal("compacted fsync request queue from %d entries to %d entries", - CheckpointerShmem->num_requests, preserve_count))); - CheckpointerShmem->num_requests = preserve_count; + num_requests, CheckpointerShmem->num_requests))); /* Cleanup. */ pfree(skip_slot); @@ -1325,40 +1427,64 @@ AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; - int n; + int n, + i; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /*--- + * We try to avoid holding the lock for a long time by: + * 1. Copying the request array and processing the requests after + * releasing the lock; + * 2. Processing not the whole queue, but only batches of + * CKPT_REQ_BATCH_SIZE at once. + * + * Once we have cleared the requests from shared memory, we must + * PANIC if we then fail to absorb them (e.g., because our hashtable + * runs out of memory). This is because the system cannot run safely + * if we are unable to fsync what we have been told to fsync. + * Fortunately, the hashtable is so small that the problem is quite + * unlikely to arise in practice. + * + * Note: The maximum possible size of a ring buffer is + * MAX_CHECKPOINT_REQUESTS entries, which fit into a maximum palloc + * allocation size of 1Gb. Our maximum batch size, + * CKPT_REQ_BATCH_SIZE, is even smaller. + */ + n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE); + if (n > 0) + { + if (!requests) + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - START_CRIT_SECTION(); + for (i = 0; i < n; i++) + { + requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head]; + CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests; + } - CheckpointerShmem->num_requests = 0; + CheckpointerShmem->num_requests -= n; - LWLockRelease(CheckpointerCommLock); + } + + START_CRIT_SECTION(); + + /* Are there any requests in the queue? If so, keep going. */ + loop = CheckpointerShmem->num_requests != 0; + + LWLockRelease(CheckpointerCommLock); - for (request = requests; n > 0; request++, n--) - RememberSyncRequest(&request->ftag, request->type); + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); - END_CRIT_SECTION(); + END_CRIT_SECTION(); + } while (loop); if (requests) pfree(requests); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 7e622ae4bd2a7..78e39e5f866a7 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -718,15 +718,15 @@ pgarch_readyXlog(char *xlog) /* * Store the file in our max-heap if it has a high enough priority. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) { /* If the heap isn't full yet, quickly add it. */ - arch_file = arch_files->arch_filenames[arch_files->arch_heap->bh_size]; + arch_file = arch_files->arch_filenames[binaryheap_size(arch_files->arch_heap)]; strcpy(arch_file, basename); binaryheap_add_unordered(arch_files->arch_heap, CStringGetDatum(arch_file)); /* If we just filled the heap, make it a valid one. */ - if (arch_files->arch_heap->bh_size == NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) == NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); } else if (ready_file_comparator(binaryheap_first(arch_files->arch_heap), @@ -744,21 +744,21 @@ pgarch_readyXlog(char *xlog) FreeDir(rldir); /* If no files were found, simply return. */ - if (arch_files->arch_heap->bh_size == 0) + if (binaryheap_empty(arch_files->arch_heap)) return false; /* * If we didn't fill the heap, we didn't make it a valid one. Do that * now. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); /* * Fill arch_files array with the files to archive in ascending order of * priority. */ - arch_files->arch_files_size = arch_files->arch_heap->bh_size; + arch_files->arch_files_size = binaryheap_size(arch_files->arch_heap); for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 490f7ce36645b..e01d9f0cfe81e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2630,6 +2630,13 @@ CleanupBackend(PMChild *bp, } bp = NULL; + /* + * In a crash case, exit immediately without resetting background worker + * state. However, if restart_after_crash is enabled, the background + * worker state (e.g., rw_pid) still needs be reset so the worker can + * restart after crash recovery. This reset is handled in + * ResetBackgroundWorkerCrashTimes(), not here. + */ if (crashed) { HandleChildCrash(bp_pid, exitstatus, procname); @@ -4337,15 +4344,15 @@ maybe_start_bgworkers(void) static bool maybe_reap_io_worker(int pid) { - for (int id = 0; id < MAX_IO_WORKERS; ++id) + for (int i = 0; i < MAX_IO_WORKERS; ++i) { - if (io_worker_children[id] && - io_worker_children[id]->pid == pid) + if (io_worker_children[i] && + io_worker_children[i]->pid == pid) { - ReleasePostmasterChildSlot(io_worker_children[id]); + ReleasePostmasterChildSlot(io_worker_children[i]); --io_worker_count; - io_worker_children[id] = NULL; + io_worker_children[i] = NULL; return true; } } @@ -4389,22 +4396,22 @@ maybe_adjust_io_workers(void) while (io_worker_count < io_workers) { PMChild *child; - int id; + int i; /* find unused entry in io_worker_children array */ - for (id = 0; id < MAX_IO_WORKERS; ++id) + for (i = 0; i < MAX_IO_WORKERS; ++i) { - if (io_worker_children[id] == NULL) + if (io_worker_children[i] == NULL) break; } - if (id == MAX_IO_WORKERS) - elog(ERROR, "could not find a free IO worker ID"); + if (i == MAX_IO_WORKERS) + elog(ERROR, "could not find a free IO worker slot"); /* Try to launch one. */ child = StartChildProcess(B_IO_WORKER); if (child != NULL) { - io_worker_children[id] = child; + io_worker_children[i] = child; ++io_worker_count; } else @@ -4415,11 +4422,11 @@ maybe_adjust_io_workers(void) if (io_worker_count > io_workers) { /* ask the IO worker in the highest slot to exit */ - for (int id = MAX_IO_WORKERS - 1; id >= 0; --id) + for (int i = MAX_IO_WORKERS - 1; i >= 0; --i) { - if (io_worker_children[id] != NULL) + if (io_worker_children[i] != NULL) { - kill(io_worker_children[id]->pid, SIGUSR2); + kill(io_worker_children[i]->pid, SIGUSR2); break; } } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 0fec4f1f871ce..777c9a8d5553b 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -385,7 +385,7 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli); ereport(DEBUG1, - errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X", + errmsg_internal("switch point from TLI %u to TLI %u is at %X/%08X", current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn))); } @@ -741,7 +741,7 @@ WaitForWalSummarization(XLogRecPtr lsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL summarization is not progressing"), - errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + errdetail("Summarization is needed through %X/%08X, but is stuck at %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); @@ -755,12 +755,12 @@ WaitForWalSummarization(XLogRecPtr lsn) current_time) / 1000; ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second", - "still waiting for WAL summarization through %X/%X after %ld seconds", + errmsg_plural("still waiting for WAL summarization through %X/%08X after %ld second", + "still waiting for WAL summarization through %X/%08X after %ld seconds", elapsed_seconds, LSN_FORMAT_ARGS(lsn), elapsed_seconds), - errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + errdetail("Summarization has reached %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); } @@ -981,7 +981,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (private_data->end_of_wal) { ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1000,8 +1000,8 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, } else ereport(ERROR, - (errmsg("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(start_lsn)))); + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(start_lsn))); } /* We shouldn't go backward. */ @@ -1034,7 +1034,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, * able to read a complete record. */ ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1045,13 +1045,13 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X: %s", + errmsg("could not read WAL from timeline %u at %X/%08X: %s", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X", + errmsg("could not read WAL from timeline %u at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); } @@ -1222,7 +1222,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* Tell the user what we did. */ ereport(DEBUG1, - errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("summarized WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1234,7 +1234,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* If we skipped a non-zero amount of WAL, log a debug message. */ if (summary_end_lsn > summary_start_lsn && fast_forward) ereport(DEBUG1, - errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("skipped summarizing WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1580,7 +1580,7 @@ summarizer_read_local_xlog_page(XLogReaderState *state, /* Debugging output. */ ereport(DEBUG1, - errmsg_internal("timeline %u became historic, can read up to %X/%X", + errmsg_internal("timeline %u became historic, can read up to %X/%08X", private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto))); } diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 78193cfb964e5..d9eab5357bc38 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -20,58 +20,13 @@ #include "common/unicode_category.h" #include "utils/pg_locale.h" -/* - * For the libc provider, to provide as much functionality as possible on a - * variety of platforms without going so far as to implement everything from - * scratch, we use several implementation strategies depending on the - * situation: - * - * 1. In C/POSIX collations, we use hard-wired code. We can't depend on - * the functions since those will obey LC_CTYPE. Note that these - * collations don't give a fig about multibyte characters. - * - * 2. When working in UTF8 encoding, we use the functions. - * This assumes that every platform uses Unicode codepoints directly - * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption - * even for non-UTF8 encodings, which may be a problem.) On some platforms - * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. - * - * 3. In all other encodings, we use the functions for pg_wchar - * values up to 255, and punt for values above that. This is 100% correct - * only in single-byte encodings such as LATINn. However, non-Unicode - * multibyte encodings are mostly Far Eastern character sets for which the - * properties being tested here aren't very relevant for higher code values - * anyway. The difficulty with using the functions with - * non-Unicode multibyte encodings is that we can have no certainty that - * the platform's wchar_t representation matches what we do in pg_wchar - * conversions. - * - * As a special case, in the "default" collation, (2) and (3) force ASCII - * letters to follow ASCII upcase/downcase rules, while in a non-default - * collation we just let the library functions do what they will. The case - * where this matters is treatment of I/i in Turkish, and the behavior is - * meant to match the upper()/lower() SQL functions. - * - * We store the active collation setting in static variables. In principle - * it could be passed down to here via the regex library's "struct vars" data - * structure; but that would require somewhat invasive changes in the regex - * library, and right now there's no real benefit to be gained from that. - * - * NB: the coding here assumes pg_wchar is an unsigned type. - */ - -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; +static struct pg_locale_struct dummy_c_locale = { + .collate_is_c = true, + .ctype_is_c = true, +}; + /* * Hard-wired character properties for C locale */ @@ -228,7 +183,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -249,8 +203,7 @@ pg_set_regex_collation(Oid collation) * catalog access is available, so we can't call * pg_newlocale_from_collation(). */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; + locale = &dummy_c_locale; } else { @@ -267,113 +220,41 @@ pg_set_regex_collation(Oid collation) * C/POSIX collations use this path regardless of database * encoding */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + locale = &dummy_c_locale; } } - pg_regex_strategy = strategy; pg_regex_locale = locale; } static int pg_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale); } static int pg_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale); } static int pg_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale); } static int @@ -388,231 +269,87 @@ pg_wc_isword(pg_wchar c) static int pg_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale); } static int pg_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale); } static int pg_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale); } static int pg_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale); } static int pg_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale); } static int pg_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale); } static pg_wchar pg_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar pg_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -738,37 +475,25 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) */ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 - max_chr = (pg_wchar) 127; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: -#if MAX_SIMPLE_CHR >= UCHAR_MAX - max_chr = (pg_wchar) UCHAR_MAX; + } + else + { + if (pg_regex_locale->ctype->max_chr != 0 && + pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR) + { + max_chr = pg_regex_locale->ctype->max_chr; pcc->cv.cclasscode = -1; -#else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; -#endif - break; - case PG_REGEX_STRATEGY_ICU: + } + else max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 7b4ddf7a8f52f..239641bfbb66a 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -232,6 +232,9 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); } + PQsetNoticeReceiver(conn->streamConn, libpqsrv_notice_receiver, + "received message via replication"); + /* * Set always-secure search path for the cases where the connection is * used to run SQL queries, so malicious users can't get control. @@ -418,31 +421,22 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) "IDENTIFY_SYSTEM", WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive database system identifier and timeline ID from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } /* * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in * 9.4 and onwards. */ if (PQnfields(res) < 3 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", - ntuples, nfields, 1, 3))); - } + PQntuples(res), PQnfields(res), 1, 3))); primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); PQclear(res); @@ -534,7 +528,7 @@ libpqrcv_startstreaming(WalReceiverConn *conn, if (options->logical) appendStringInfoString(&cmd, " LOGICAL"); - appendStringInfo(&cmd, " %X/%X", LSN_FORMAT_ARGS(options->startpoint)); + appendStringInfo(&cmd, " %X/%08X", LSN_FORMAT_ARGS(options->startpoint)); /* * Additional options are different depending on if we are doing logical @@ -604,13 +598,10 @@ libpqrcv_startstreaming(WalReceiverConn *conn, return false; } else if (PQresultStatus(res) != PGRES_COPY_BOTH) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not start WAL streaming: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } PQclear(res); return true; } @@ -718,26 +709,17 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, cmd, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive timeline history file from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } if (PQnfields(res) != 2 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", - ntuples, nfields))); - } + PQntuples(res), PQnfields(res)))); *filename = pstrdup(PQgetvalue(res, 0, 0)); *len = PQgetlength(res, 0, 1); @@ -841,13 +823,10 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, return -1; } else - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive data from WAL stream: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } } if (rawlen < -1) ereport(ERROR, @@ -971,13 +950,10 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, pfree(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not create replication slot \"%s\": %s", slotname, pchomp(PQerrorMessage(conn->streamConn))))); - } if (lsn) *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index d25085d351535..1fa931a74229d 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -441,7 +441,8 @@ pa_launch_parallel_worker(void) MySubscription->name, MyLogicalRepWorker->userid, InvalidOid, - dsm_segment_handle(winfo->dsm_seg)); + dsm_segment_handle(winfo->dsm_seg), + false); if (launched) { diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 4aed0dfcebb24..742d9ba68e900 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -32,6 +32,7 @@ #include "postmaster/interrupt.h" #include "replication/logicallauncher.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "storage/ipc.h" @@ -91,7 +92,6 @@ static dshash_table *last_start_times = NULL; static bool on_commit_launcher_wakeup = false; -static void ApplyLauncherWakeup(void); static void logicalrep_launcher_onexit(int code, Datum arg); static void logicalrep_worker_onexit(int code, Datum arg); static void logicalrep_worker_detach(void); @@ -100,6 +100,9 @@ static int logicalrep_pa_worker_count(Oid subid); static void logicalrep_launcher_attach_dshmem(void); static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); +static void compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin); +static bool acquire_conflict_slot_if_exists(void); +static void advance_conflict_slot_xmin(TransactionId new_xmin); /* @@ -148,6 +151,7 @@ get_subscription_list(void) sub->owner = subform->subowner; sub->enabled = subform->subenabled; sub->name = pstrdup(NameStr(subform->subname)); + sub->retaindeadtuples = subform->subretaindeadtuples; /* We don't fill fields we are not interested in. */ res = lappend(res, sub); @@ -309,7 +313,8 @@ logicalrep_workers_find(Oid subid, bool only_running, bool acquire_lock) bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, - Oid relid, dsm_handle subworker_dsm) + Oid relid, dsm_handle subworker_dsm, + bool retain_dead_tuples) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -328,10 +333,13 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * - must be valid worker type * - tablesync workers are only ones to have relid * - parallel apply worker is the only kind of subworker + * - The replication slot used in conflict detection is created when + * retain_dead_tuples is enabled */ Assert(wtype != WORKERTYPE_UNKNOWN); Assert(is_tablesync_worker == OidIsValid(relid)); Assert(is_parallel_apply_worker == (subworker_dsm != DSM_HANDLE_INVALID)); + Assert(!retain_dead_tuples || MyReplicationSlot); ereport(DEBUG1, (errmsg_internal("starting logical replication worker for subscription \"%s\"", @@ -454,6 +462,9 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, worker->stream_fileset = NULL; worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; worker->parallel_apply = is_parallel_apply_worker; + worker->oldest_nonremovable_xid = retain_dead_tuples + ? MyReplicationSlot->data.xmin + : InvalidTransactionId; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); @@ -1118,7 +1129,10 @@ ApplyLauncherWakeupAtCommit(void) on_commit_launcher_wakeup = true; } -static void +/* + * Wakeup the launcher immediately. + */ +void ApplyLauncherWakeup(void) { if (LogicalRepCtx->launcher_pid != 0) @@ -1150,6 +1164,12 @@ ApplyLauncherMain(Datum main_arg) */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); + /* + * Acquire the conflict detection slot at startup to ensure it can be + * dropped if no longer needed after a restart. + */ + acquire_conflict_slot_if_exists(); + /* Enter main loop */ for (;;) { @@ -1159,6 +1179,9 @@ ApplyLauncherMain(Datum main_arg) MemoryContext subctx; MemoryContext oldctx; long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + bool can_advance_xmin = true; + bool retain_dead_tuples = false; + TransactionId xmin = InvalidTransactionId; CHECK_FOR_INTERRUPTS(); @@ -1168,7 +1191,14 @@ ApplyLauncherMain(Datum main_arg) ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(subctx); - /* Start any missing workers for enabled subscriptions. */ + /* + * Start any missing workers for enabled subscriptions. + * + * Also, during the iteration through all subscriptions, we compute + * the minimum XID required to protect deleted tuples for conflict + * detection if one of the subscription enables retain_dead_tuples + * option. + */ sublist = get_subscription_list(); foreach(lc, sublist) { @@ -1178,6 +1208,38 @@ ApplyLauncherMain(Datum main_arg) TimestampTz now; long elapsed; + if (sub->retaindeadtuples) + { + retain_dead_tuples = true; + + /* + * Can't advance xmin of the slot unless all the subscriptions + * with retain_dead_tuples are enabled. This is required to + * ensure that we don't advance the xmin of + * CONFLICT_DETECTION_SLOT if one of the subscriptions is not + * enabled. Otherwise, we won't be able to detect conflicts + * reliably for such a subscription even though it has set the + * retain_dead_tuples option. + */ + can_advance_xmin &= sub->enabled; + + /* + * Create a replication slot to retain information necessary + * for conflict detection such as dead tuples, commit + * timestamps, and origins. + * + * The slot is created before starting the apply worker to + * prevent it from unnecessarily maintaining its + * oldest_nonremovable_xid. + * + * The slot is created even for a disabled subscription to + * ensure that conflict-related information is available when + * applying remote changes that occurred before the + * subscription was enabled. + */ + CreateConflictDetectionSlot(); + } + if (!sub->enabled) continue; @@ -1186,7 +1248,27 @@ ApplyLauncherMain(Datum main_arg) LWLockRelease(LogicalRepWorkerLock); if (w != NULL) - continue; /* worker is running already */ + { + /* + * Compute the minimum xmin required to protect dead tuples + * required for conflict detection among all running apply + * workers that enables retain_dead_tuples. + */ + if (sub->retaindeadtuples && can_advance_xmin) + compute_min_nonremovable_xid(w, &xmin); + + /* worker is running already */ + continue; + } + + /* + * Can't advance xmin of the slot unless all the workers + * corresponding to subscriptions with retain_dead_tuples are + * running, disabling the further computation of the minimum + * nonremovable xid. + */ + if (sub->retaindeadtuples) + can_advance_xmin = false; /* * If the worker is eligible to start now, launch it. Otherwise, @@ -1210,7 +1292,8 @@ ApplyLauncherMain(Datum main_arg) if (!logicalrep_worker_launch(WORKERTYPE_APPLY, sub->dbid, sub->oid, sub->name, sub->owner, InvalidOid, - DSM_HANDLE_INVALID)) + DSM_HANDLE_INVALID, + sub->retaindeadtuples)) { /* * We get here either if we failed to launch a worker @@ -1230,6 +1313,20 @@ ApplyLauncherMain(Datum main_arg) } } + /* + * Drop the CONFLICT_DETECTION_SLOT slot if there is no subscription + * that requires us to retain dead tuples. Otherwise, if required, + * advance the slot's xmin to protect dead tuples required for the + * conflict detection. + */ + if (MyReplicationSlot) + { + if (!retain_dead_tuples) + ReplicationSlotDropAcquired(); + else if (can_advance_xmin) + advance_conflict_slot_xmin(xmin); + } + /* Switch back to original memory context. */ MemoryContextSwitchTo(oldctx); /* Clean the temporary memory. */ @@ -1257,6 +1354,125 @@ ApplyLauncherMain(Datum main_arg) /* Not reachable */ } +/* + * Determine the minimum non-removable transaction ID across all apply workers + * for subscriptions that have retain_dead_tuples enabled. Store the result + * in *xmin. + */ +static void +compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin) +{ + TransactionId nonremovable_xid; + + Assert(worker != NULL); + + /* + * The replication slot for conflict detection must be created before the + * worker starts. + */ + Assert(MyReplicationSlot); + + SpinLockAcquire(&worker->relmutex); + nonremovable_xid = worker->oldest_nonremovable_xid; + SpinLockRelease(&worker->relmutex); + + Assert(TransactionIdIsValid(nonremovable_xid)); + + if (!TransactionIdIsValid(*xmin) || + TransactionIdPrecedes(nonremovable_xid, *xmin)) + *xmin = nonremovable_xid; +} + +/* + * Acquire the replication slot used to retain information for conflict + * detection, if it exists. + * + * Return true if successfully acquired, otherwise return false. + */ +static bool +acquire_conflict_slot_if_exists(void) +{ + if (!SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true)) + return false; + + ReplicationSlotAcquire(CONFLICT_DETECTION_SLOT, true, false); + return true; +} + +/* + * Advance the xmin the replication slot used to retain information required + * for conflict detection. + */ +static void +advance_conflict_slot_xmin(TransactionId new_xmin) +{ + Assert(MyReplicationSlot); + Assert(TransactionIdIsValid(new_xmin)); + Assert(TransactionIdPrecedesOrEquals(MyReplicationSlot->data.xmin, new_xmin)); + + /* Return if the xmin value of the slot cannot be advanced */ + if (TransactionIdEquals(MyReplicationSlot->data.xmin, new_xmin)) + return; + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = new_xmin; + MyReplicationSlot->data.xmin = new_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + elog(DEBUG1, "updated xmin: %u", MyReplicationSlot->data.xmin); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + + /* + * Like PhysicalConfirmReceivedLocation(), do not save slot information + * each time. This is acceptable because all concurrent transactions on + * the publisher that require the data preceding the slot's xmin should + * have already been applied and flushed on the subscriber before the xmin + * is advanced. So, even if the slot's xmin regresses after a restart, it + * will be advanced again in the next cycle. Therefore, no data required + * for conflict detection will be prematurely removed. + */ + return; +} + +/* + * Create and acquire the replication slot used to retain information for + * conflict detection, if not yet. + */ +void +CreateConflictDetectionSlot(void) +{ + TransactionId xmin_horizon; + + /* Exit early, if the replication slot is already created and acquired */ + if (MyReplicationSlot) + return; + + ereport(LOG, + errmsg("creating replication conflict detection slot")); + + ReplicationSlotCreate(CONFLICT_DETECTION_SLOT, false, RS_PERSISTENT, false, + false, false); + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(false); + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = xmin_horizon; + MyReplicationSlot->data.xmin = xmin_horizon; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + /* * Is current process the logical replication launcher? */ diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index f1eb798f3e97a..7e363a7c05b4f 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -567,7 +567,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, * kinds of client errors; so the client may wish to check that * confirmed_flush_lsn matches its expectations. */ - elog(LOG, "%X/%X has been already streamed, forwarding to %X/%X", + elog(LOG, "%X/%08X has been already streamed, forwarding to %X/%08X", LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush)); @@ -610,7 +610,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, ereport(LOG, (errmsg("starting logical decoding for slot \"%s\"", NameStr(slot->data.name)), - errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.", + errdetail("Streaming transactions committing after %X/%08X, reading WAL from %X/%08X.", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(slot->data.restart_lsn)))); @@ -637,7 +637,7 @@ DecodingContextFindStartpoint(LogicalDecodingContext *ctx) /* Initialize from where to start reading WAL. */ XLogBeginRead(ctx->reader, slot->data.restart_lsn); - elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%08X", LSN_FORMAT_ARGS(slot->data.restart_lsn)); /* Wait for a consistent starting point */ @@ -758,7 +758,7 @@ output_plugin_error_callback(void *arg) /* not all callbacks have an associated LSN */ if (state->report_location != InvalidXLogRecPtr) - errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%08X", NameStr(state->ctx->slot->data.name), NameStr(state->ctx->slot->data.plugin), state->callback_name, @@ -1725,7 +1725,7 @@ LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) SpinLockRelease(&slot->mutex); if (got_new_xmin) - elog(DEBUG1, "got new catalog xmin %u at %X/%X", xmin, + elog(DEBUG1, "got new catalog xmin %u at %X/%08X", xmin, LSN_FORMAT_ARGS(current_lsn)); /* candidate already valid with the current flush position, apply */ @@ -1785,7 +1785,7 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart slot->candidate_restart_lsn = restart_lsn; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + elog(DEBUG1, "got new restart lsn %X/%08X at %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn)); } @@ -1800,7 +1800,7 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart confirmed_flush = slot->data.confirmed_flush; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%08X, after %X/%08X, current candidate %X/%08X, current after %X/%08X, flushed up to %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn), LSN_FORMAT_ARGS(candidate_restart_lsn), diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index a17bacf88e7f3..87f10e50dcc4d 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -826,9 +826,9 @@ StartupReplicationOrigin(void) last_state++; ereport(LOG, - (errmsg("recovered replication state of node %d to %X/%X", - disk_state.roident, - LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + errmsg("recovered replication state of node %d to %X/%08X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn))); } /* now check checksum */ diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index c4299c76fb16b..5febd154b6bae 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1415,7 +1415,7 @@ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) int32 off; /* nothing there anymore */ - if (state->heap->bh_size == 0) + if (binaryheap_empty(state->heap)) return NULL; off = DatumGetInt32(binaryheap_first(state->heap)); @@ -4917,7 +4917,7 @@ StartupReorderBuffer(void) continue; /* if it cannot be a slot, skip the directory */ - if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2)) continue; /* diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 3ec3abfa3da60..2f0c08b8fbd33 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -213,7 +213,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1, errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Synchronization could lead to data loss, because the remote slot needs WAL at LSN %X/%X and catalog xmin %u, but the standby has LSN %X/%X and catalog xmin %u.", + errdetail("Synchronization could lead to data loss, because the remote slot needs WAL at LSN %X/%08X and catalog xmin %u, but the standby has LSN %X/%08X and catalog xmin %u.", LSN_FORMAT_ARGS(remote_slot->restart_lsn), remote_slot->catalog_xmin, LSN_FORMAT_ARGS(slot->data.restart_lsn), @@ -275,7 +275,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, ereport(ERROR, errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot", remote_slot->name), - errdetail_internal("Remote slot has LSN %X/%X but local slot has LSN %X/%X.", + errdetail_internal("Remote slot has LSN %X/%08X but local slot has LSN %X/%08X.", LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush))); } @@ -593,7 +593,7 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) { ereport(LOG, errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%X.", + errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.", LSN_FORMAT_ARGS(slot->data.restart_lsn))); return false; @@ -642,7 +642,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("skipping slot synchronization because the received slot sync" - " LSN %X/%X for slot \"%s\" is ahead of the standby position %X/%X", + " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), remote_slot->name, LSN_FORMAT_ARGS(latestFlushPtr))); @@ -733,7 +733,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ereport(ERROR, errmsg_internal("cannot synchronize local slot \"%s\"", remote_slot->name), - errdetail_internal("Local slot's start streaming location LSN(%X/%X) is ahead of remote slot's LSN(%X/%X).", + errdetail_internal("Local slot's start streaming location LSN(%X/%08X) is ahead of remote slot's LSN(%X/%08X).", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(remote_slot->confirmed_lsn))); diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index adf18c397db43..8532bfd27e53f 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -774,7 +774,7 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact if (rbtxn_is_prepared(txn)) continue; - elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%X", + elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%08X", txn->xid, LSN_FORMAT_ARGS(lsn)); /* @@ -1271,10 +1271,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->initial_xmin_horizon)) { ereport(DEBUG1, - (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", - LSN_FORMAT_ARGS(lsn)), - errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", - builder->initial_xmin_horizon, running->oldestRunningXid))); + errmsg_internal("skipping snapshot at %X/%08X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid)); SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); @@ -1310,9 +1310,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no running transactions."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions.")); return false; } @@ -1359,10 +1359,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn Assert(TransactionIdIsNormal(builder->xmax)); ereport(LOG, - (errmsg("logical decoding found initial starting point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial starting point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1383,10 +1383,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = running->nextXid; ereport(LOG, - (errmsg("logical decoding found initial consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1407,9 +1407,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no old transactions anymore."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore.")); } /* @@ -1913,9 +1913,9 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) Assert(builder->state == SNAPBUILD_CONSISTENT); ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Logical decoding will begin using saved snapshot."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot.")); return true; snapshot_not_interesting: @@ -2061,7 +2061,7 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) int ret; struct stat stat_buf; - sprintf(path, "%s/%X-%X.snap", + sprintf(path, "%s/%08X-%08X.snap", PG_LOGICAL_SNAPSHOTS_DIR, LSN_FORMAT_ARGS(lsn)); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index c90f23ee5b0b2..3fea0a0206ed3 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -615,7 +615,8 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) MySubscription->name, MyLogicalRepWorker->userid, rstate->relid, - DSM_HANDLE_INVALID); + DSM_HANDLE_INVALID, + false); } } } @@ -1553,7 +1554,7 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) copy_table_done: elog(DEBUG1, - "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%08X", originname, LSN_FORMAT_ARGS(*origin_startpos)); /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index fd11805a44cf9..b59221c4d0636 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -132,6 +132,96 @@ * failover = true when creating the subscription. Enabling failover allows us * to smoothly transition to the promoted standby, ensuring that we can * subscribe to the new primary without losing any data. + * + * RETAIN DEAD TUPLES + * ---------------------- + * Each apply worker that enabled retain_dead_tuples option maintains a + * non-removable transaction ID (oldest_nonremovable_xid) in shared memory to + * prevent dead rows from being removed prematurely when the apply worker still + * needs them to detect conflicts reliably. This helps to retain the required + * commit_ts module information, which further helps to detect + * update_origin_differs and delete_origin_differs conflicts reliably, as + * otherwise, vacuum freeze could remove the required information. + * + * The logical replication launcher manages an internal replication slot named + * "pg_conflict_detection". It asynchronously aggregates the non-removable + * transaction ID from all apply workers to determine the appropriate xmin for + * the slot, thereby retaining necessary tuples. + * + * The non-removable transaction ID in the apply worker is advanced to the + * oldest running transaction ID once all concurrent transactions on the + * publisher have been applied and flushed locally. The process involves: + * + * - RDT_GET_CANDIDATE_XID: + * Call GetOldestActiveTransactionId() to take oldestRunningXid as the + * candidate xid. + * + * - RDT_REQUEST_PUBLISHER_STATUS: + * Send a message to the walsender requesting the publisher status, which + * includes the latest WAL write position and information about transactions + * that are in the commit phase. + * + * - RDT_WAIT_FOR_PUBLISHER_STATUS: + * Wait for the status from the walsender. After receiving the first status, + * do not proceed if there are concurrent remote transactions that are still + * in the commit phase. These transactions might have been assigned an + * earlier commit timestamp but have not yet written the commit WAL record. + * Continue to request the publisher status (RDT_REQUEST_PUBLISHER_STATUS) + * until all these transactions have completed. + * + * - RDT_WAIT_FOR_LOCAL_FLUSH: + * Advance the non-removable transaction ID if the current flush location has + * reached or surpassed the last received WAL position. + * + * The overall state progression is: GET_CANDIDATE_XID -> + * REQUEST_PUBLISHER_STATUS -> WAIT_FOR_PUBLISHER_STATUS -> (loop to + * REQUEST_PUBLISHER_STATUS till concurrent remote transactions end) -> + * WAIT_FOR_LOCAL_FLUSH -> loop back to GET_CANDIDATE_XID. + * + * Retaining the dead tuples for this period is sufficient for ensuring + * eventual consistency using last-update-wins strategy, as dead tuples are + * useful for detecting conflicts only during the application of concurrent + * transactions from remote nodes. After applying and flushing all remote + * transactions that occurred concurrently with the tuple DELETE, any + * subsequent UPDATE from a remote node should have a later timestamp. In such + * cases, it is acceptable to detect an update_missing scenario and convert the + * UPDATE to an INSERT when applying it. But, detecting concurrent remote + * transactions with earlier timestamps than the DELETE is necessary, as the + * UPDATEs in remote transactions should be ignored if their timestamp is + * earlier than that of the dead tuples. + * + * Note that advancing the non-removable transaction ID is not supported if the + * publisher is also a physical standby. This is because the logical walsender + * on the standby can only get the WAL replay position but there may be more + * WALs that are being replicated from the primary and those WALs could have + * earlier commit timestamp. + * + * Similarly, when the publisher has subscribed to another publisher, + * information necessary for conflict detection cannot be retained for + * changes from origins other than the publisher. This is because publisher + * lacks the information on concurrent transactions of other publishers to + * which it subscribes. As the information on concurrent transactions is + * unavailable beyond subscriber's immediate publishers, the non-removable + * transaction ID might be advanced prematurely before changes from other + * origins have been fully applied. + * + * XXX Retaining information for changes from other origins might be possible + * by requesting the subscription on that origin to enable retain_dead_tuples + * and fetching the conflict detection slot.xmin along with the publisher's + * status. In the RDT_WAIT_FOR_PUBLISHER_STATUS phase, the apply worker could + * wait for the remote slot's xmin to reach the oldest active transaction ID, + * ensuring that all transactions from other origins have been applied on the + * publisher, thereby getting the latest WAL position that includes all + * concurrent changes. However, this approach may impact performance, so it + * might not worth the effort. + * + * XXX It seems feasible to get the latest commit's WAL location from the + * publisher and wait till that is applied. However, we can't do that + * because commit timestamps can regress as a commit with a later LSN is not + * guaranteed to have a later timestamp than those with earlier LSNs. Having + * said that, even if that is possible, it won't improve performance much as + * the apply always lag and moves slowly as compared with the transactions + * on the publisher. *------------------------------------------------------------------------- */ @@ -140,6 +230,7 @@ #include #include +#include "access/commit_ts.h" #include "access/table.h" #include "access/tableam.h" #include "access/twophase.h" @@ -148,6 +239,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -166,12 +258,14 @@ #include "replication/logicalrelation.h" #include "replication/logicalworker.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/buffile.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/dynahash.h" @@ -268,6 +362,78 @@ typedef enum TRANS_PARALLEL_APPLY, } TransApplyAction; +/* + * The phases involved in advancing the non-removable transaction ID. + * + * See comments atop worker.c for details of the transition between these + * phases. + */ +typedef enum +{ + RDT_GET_CANDIDATE_XID, + RDT_REQUEST_PUBLISHER_STATUS, + RDT_WAIT_FOR_PUBLISHER_STATUS, + RDT_WAIT_FOR_LOCAL_FLUSH +} RetainDeadTuplesPhase; + +/* + * Critical information for managing phase transitions within the + * RetainDeadTuplesPhase. + */ +typedef struct RetainDeadTuplesData +{ + RetainDeadTuplesPhase phase; /* current phase */ + XLogRecPtr remote_lsn; /* WAL write position on the publisher */ + + /* + * Oldest transaction ID that was in the commit phase on the publisher. + * Use FullTransactionId to prevent issues with transaction ID wraparound, + * where a new remote_oldestxid could falsely appear to originate from the + * past and block advancement. + */ + FullTransactionId remote_oldestxid; + + /* + * Next transaction ID to be assigned on the publisher. Use + * FullTransactionId for consistency and to allow straightforward + * comparisons with remote_oldestxid. + */ + FullTransactionId remote_nextxid; + + TimestampTz reply_time; /* when the publisher responds with status */ + + /* + * Publisher transaction ID that must be awaited to complete before + * entering the final phase (RDT_WAIT_FOR_LOCAL_FLUSH). Use + * FullTransactionId for the same reason as remote_nextxid. + */ + FullTransactionId remote_wait_for; + + TransactionId candidate_xid; /* candidate for the non-removable + * transaction ID */ + TimestampTz flushpos_update_time; /* when the remote flush position was + * updated in final phase + * (RDT_WAIT_FOR_LOCAL_FLUSH) */ + + /* + * The following fields are used to determine the timing for the next + * round of transaction ID advancement. + */ + TimestampTz last_recv_time; /* when the last message was received */ + TimestampTz candidate_xid_time; /* when the candidate_xid is decided */ + int xid_advance_interval; /* how much time (ms) to wait before + * attempting to advance the + * non-removable transaction ID */ +} RetainDeadTuplesData; + +/* + * The minimum (100ms) and maximum (3 minutes) intervals for advancing + * non-removable transaction IDs. The maximum interval is a bit arbitrary but + * is sufficient to not cause any undue network traffic. + */ +#define MIN_XID_ADVANCE_INTERVAL 100 +#define MAX_XID_ADVANCE_INTERVAL 180000 + /* errcontext tracker */ static ApplyErrorCallbackArg apply_error_callback_arg = { @@ -332,6 +498,13 @@ static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; /* BufFile handle of the current streaming file */ static BufFile *stream_fd = NULL; +/* + * The remote WAL position that has been applied and flushed locally. We record + * and use this information both while sending feedback to the server and + * advancing oldest_nonremovable_xid. + */ +static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + typedef struct SubXactInfo { TransactionId xid; /* XID of the subxact */ @@ -372,6 +545,19 @@ static void stream_close_file(void); static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); +static void maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received); +static bool can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data); +static void process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received); +static void get_candidate_xid(RetainDeadTuplesData *rdt_data); +static void request_publisher_status(RetainDeadTuplesData *rdt_data); +static void wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received); +static void wait_for_local_flush(RetainDeadTuplesData *rdt_data); +static void adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, + bool new_xid_found); + static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); static void apply_handle_insert_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, @@ -1016,7 +1202,7 @@ apply_handle_commit(StringInfo s) if (commit_data.commit_lsn != remote_final_lsn) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)", + errmsg_internal("incorrect commit LSN %X/%08X in commit message (expected %X/%08X)", LSN_FORMAT_ARGS(commit_data.commit_lsn), LSN_FORMAT_ARGS(remote_final_lsn)))); @@ -1108,7 +1294,7 @@ apply_handle_prepare(StringInfo s) if (prepare_data.prepare_lsn != remote_final_lsn) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)", + errmsg_internal("incorrect prepare LSN %X/%08X in prepare message (expected %X/%08X)", LSN_FORMAT_ARGS(prepare_data.prepare_lsn), LSN_FORMAT_ARGS(remote_final_lsn)))); @@ -3577,6 +3763,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) bool ping_sent = false; TimeLineID tli; ErrorContextCallback errcallback; + RetainDeadTuplesData rdt_data = {0}; /* * Init the ApplyMessageContext which we clean up after each replication @@ -3655,6 +3842,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_recv_timestamp = GetCurrentTimestamp(); ping_sent = false; + rdt_data.last_recv_time = last_recv_timestamp; + /* Ensure we are reading the data into our memory context. */ MemoryContextSwitchTo(ApplyMessageContext); @@ -3681,6 +3870,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) UpdateWorkerStats(last_received, send_time, false); apply_dispatch(&s); + + maybe_advance_nonremovable_xid(&rdt_data, false); } else if (c == 'k') { @@ -3696,8 +3887,31 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_received = end_lsn; send_feedback(last_received, reply_requested, false); + + maybe_advance_nonremovable_xid(&rdt_data, false); + UpdateWorkerStats(last_received, timestamp, true); } + else if (c == 's') /* Primary status update */ + { + rdt_data.remote_lsn = pq_getmsgint64(&s); + rdt_data.remote_oldestxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.remote_nextxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.reply_time = pq_getmsgint64(&s); + + /* + * This should never happen, see + * ProcessStandbyPSRequestMessage. But if it happens + * due to a bug, we don't want to proceed as it can + * incorrectly advance oldest_nonremovable_xid. + */ + if (XLogRecPtrIsInvalid(rdt_data.remote_lsn)) + elog(ERROR, "cannot get the latest WAL position from the publisher"); + + maybe_advance_nonremovable_xid(&rdt_data, true); + + UpdateWorkerStats(last_received, rdt_data.reply_time, false); + } /* other message types are purposefully ignored */ MemoryContextReset(ApplyMessageContext); @@ -3710,6 +3924,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); + /* Reset the timestamp if no message was received */ + rdt_data.last_recv_time = 0; + + maybe_advance_nonremovable_xid(&rdt_data, false); + if (!in_remote_transaction && !in_streamed_transaction) { /* @@ -3744,6 +3963,14 @@ LogicalRepApplyLoop(XLogRecPtr last_received) else wait_time = NAPTIME_PER_CYCLE; + /* + * Ensure to wake up when it's possible to advance the non-removable + * transaction ID. + */ + if (rdt_data.phase == RDT_GET_CANDIDATE_XID && + rdt_data.xid_advance_interval) + wait_time = Min(wait_time, rdt_data.xid_advance_interval); + rc = WaitLatchOrSocket(MyLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3807,6 +4034,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) send_feedback(last_received, requestReply, requestReply); + maybe_advance_nonremovable_xid(&rdt_data, false); + /* * Force reporting to ensure long idle periods don't lead to * arbitrarily delayed stats. Stats can only be reported outside @@ -3842,7 +4071,6 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) static XLogRecPtr last_recvpos = InvalidXLogRecPtr; static XLogRecPtr last_writepos = InvalidXLogRecPtr; - static XLogRecPtr last_flushpos = InvalidXLogRecPtr; XLogRecPtr writepos; XLogRecPtr flushpos; @@ -3903,7 +4131,7 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) pq_sendint64(reply_message, now); /* sendTime */ pq_sendbyte(reply_message, requestReply); /* replyRequested */ - elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + elog(DEBUG2, "sending feedback (force %d) to recv %X/%08X, write %X/%08X, flush %X/%08X", force, LSN_FORMAT_ARGS(recvpos), LSN_FORMAT_ARGS(writepos), @@ -3920,6 +4148,367 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) last_flushpos = flushpos; } +/* + * Attempt to advance the non-removable transaction ID. + * + * See comments atop worker.c for details. + */ +static void +maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + if (!can_advance_nonremovable_xid(rdt_data)) + return; + + process_rdt_phase_transition(rdt_data, status_received); +} + +/* + * Preliminary check to determine if advancing the non-removable transaction ID + * is allowed. + */ +static bool +can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data) +{ + /* + * It is sufficient to manage non-removable transaction ID for a + * subscription by the main apply worker to detect conflicts reliably even + * for table sync or parallel apply workers. + */ + if (!am_leader_apply_worker()) + return false; + + /* No need to advance if retaining dead tuples is not required */ + if (!MySubscription->retaindeadtuples) + return false; + + return true; +} + +/* + * Process phase transitions during the non-removable transaction ID + * advancement. See comments atop worker.c for details of the transition. + */ +static void +process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + switch (rdt_data->phase) + { + case RDT_GET_CANDIDATE_XID: + get_candidate_xid(rdt_data); + break; + case RDT_REQUEST_PUBLISHER_STATUS: + request_publisher_status(rdt_data); + break; + case RDT_WAIT_FOR_PUBLISHER_STATUS: + wait_for_publisher_status(rdt_data, status_received); + break; + case RDT_WAIT_FOR_LOCAL_FLUSH: + wait_for_local_flush(rdt_data); + break; + } +} + +/* + * Workhorse for the RDT_GET_CANDIDATE_XID phase. + */ +static void +get_candidate_xid(RetainDeadTuplesData *rdt_data) +{ + TransactionId oldest_running_xid; + TimestampTz now; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Compute the candidate_xid and request the publisher status at most once + * per xid_advance_interval. Refer to adjust_xid_advance_interval() for + * details on how this value is dynamically adjusted. This is to avoid + * using CPU and network resources without making much progress. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + rdt_data->xid_advance_interval)) + return; + + /* + * Immediately update the timer, even if the function returns later + * without setting candidate_xid due to inactivity on the subscriber. This + * avoids frequent calls to GetOldestActiveTransactionId. + */ + rdt_data->candidate_xid_time = now; + + /* + * Consider transactions in the current database, as only dead tuples from + * this database are required for conflict detection. + */ + oldest_running_xid = GetOldestActiveTransactionId(false, false); + + /* + * Oldest active transaction ID (oldest_running_xid) can't be behind any + * of its previously computed value. + */ + Assert(TransactionIdPrecedesOrEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)); + + /* Return if the oldest_nonremovable_xid cannot be advanced */ + if (TransactionIdEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)) + { + adjust_xid_advance_interval(rdt_data, false); + return; + } + + adjust_xid_advance_interval(rdt_data, true); + + rdt_data->candidate_xid = oldest_running_xid; + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_REQUEST_PUBLISHER_STATUS phase. + */ +static void +request_publisher_status(RetainDeadTuplesData *rdt_data) +{ + static StringInfo request_message = NULL; + + if (!request_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + request_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(request_message); + + /* + * Send the current time to update the remote walsender's latest reply + * message received time. + */ + pq_sendbyte(request_message, 'p'); + pq_sendint64(request_message, GetCurrentTimestamp()); + + elog(DEBUG2, "sending publisher status request message"); + + /* Send a request for the publisher status */ + walrcv_send(LogRepWorkerWalRcvConn, + request_message->data, request_message->len); + + rdt_data->phase = RDT_WAIT_FOR_PUBLISHER_STATUS; + + /* + * Skip calling maybe_advance_nonremovable_xid() since further transition + * is possible only once we receive the publisher status message. + */ +} + +/* + * Workhorse for the RDT_WAIT_FOR_PUBLISHER_STATUS phase. + */ +static void +wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + /* + * Return if we have requested but not yet received the publisher status. + */ + if (!status_received) + return; + + if (!FullTransactionIdIsValid(rdt_data->remote_wait_for)) + rdt_data->remote_wait_for = rdt_data->remote_nextxid; + + /* + * Check if all remote concurrent transactions that were active at the + * first status request have now completed. If completed, proceed to the + * next phase; otherwise, continue checking the publisher status until + * these transactions finish. + * + * It's possible that transactions in the commit phase during the last + * cycle have now finished committing, but remote_oldestxid remains older + * than remote_wait_for. This can happen if some old transaction came in + * the commit phase when we requested status in this cycle. We do not + * handle this case explicitly as it's rare and the benefit doesn't + * justify the required complexity. Tracking would require either caching + * all xids at the publisher or sending them to subscribers. The condition + * will resolve naturally once the remaining transactions are finished. + * + * Directly advancing the non-removable transaction ID is possible if + * there are no activities on the publisher since the last advancement + * cycle. However, it requires maintaining two fields, last_remote_nextxid + * and last_remote_lsn, within the structure for comparison with the + * current cycle's values. Considering the minimal cost of continuing in + * RDT_WAIT_FOR_LOCAL_FLUSH without awaiting changes, we opted not to + * advance the transaction ID here. + */ + if (FullTransactionIdPrecedesOrEquals(rdt_data->remote_wait_for, + rdt_data->remote_oldestxid)) + rdt_data->phase = RDT_WAIT_FOR_LOCAL_FLUSH; + else + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_WAIT_FOR_LOCAL_FLUSH phase. + */ +static void +wait_for_local_flush(RetainDeadTuplesData *rdt_data) +{ + Assert(!XLogRecPtrIsInvalid(rdt_data->remote_lsn) && + TransactionIdIsValid(rdt_data->candidate_xid)); + + /* + * We expect the publisher and subscriber clocks to be in sync using time + * sync service like NTP. Otherwise, we will advance this worker's + * oldest_nonremovable_xid prematurely, leading to the removal of rows + * required to detect conflicts reliably. This check primarily addresses + * scenarios where the publisher's clock falls behind; if the publisher's + * clock is ahead, subsequent transactions will naturally bear later + * commit timestamps, conforming to the design outlined atop worker.c. + * + * XXX Consider waiting for the publisher's clock to catch up with the + * subscriber's before proceeding to the next phase. + */ + if (TimestampDifferenceExceeds(rdt_data->reply_time, + rdt_data->candidate_xid_time, 0)) + ereport(ERROR, + errmsg_internal("oldest_nonremovable_xid transaction ID could be advanced prematurely"), + errdetail_internal("The clock on the publisher is behind that of the subscriber.")); + + /* + * Do not attempt to advance the non-removable transaction ID when table + * sync is in progress. During this time, changes from a single + * transaction may be applied by multiple table sync workers corresponding + * to the target tables. So, it's necessary for all table sync workers to + * apply and flush the corresponding changes before advancing the + * transaction ID, otherwise, dead tuples that are still needed for + * conflict detection in table sync workers could be removed prematurely. + * However, confirming the apply and flush progress across all table sync + * workers is complex and not worth the effort, so we simply return if not + * all tables are in the READY state. + * + * It is safe to add new tables with initial states to the subscription + * after this check because any changes applied to these tables should + * have a WAL position greater than the rdt_data->remote_lsn. + */ + if (!AllTablesyncsReady()) + return; + + /* + * Update and check the remote flush position if we are applying changes + * in a loop. This is done at most once per WalWriterDelay to avoid + * performing costly operations in get_flush_position() too frequently + * during change application. + */ + if (last_flushpos < rdt_data->remote_lsn && rdt_data->last_recv_time && + TimestampDifferenceExceeds(rdt_data->flushpos_update_time, + rdt_data->last_recv_time, WalWriterDelay)) + { + XLogRecPtr writepos; + XLogRecPtr flushpos; + bool have_pending_txes; + + /* Fetch the latest remote flush position */ + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + if (flushpos > last_flushpos) + last_flushpos = flushpos; + + rdt_data->flushpos_update_time = rdt_data->last_recv_time; + } + + /* Return to wait for the changes to be applied */ + if (last_flushpos < rdt_data->remote_lsn) + return; + + /* + * Reaching here means the remote WAL position has been received, and all + * transactions up to that position on the publisher have been applied and + * flushed locally. So, we can advance the non-removable transaction ID. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->oldest_nonremovable_xid = rdt_data->candidate_xid; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + elog(DEBUG2, "confirmed flush up to remote lsn %X/%X: new oldest_nonremovable_xid %u", + LSN_FORMAT_ARGS(rdt_data->remote_lsn), + rdt_data->candidate_xid); + + /* Notify launcher to update the xmin of the conflict slot */ + ApplyLauncherWakeup(); + + /* + * Reset all data fields except those used to determine the timing for the + * next round of transaction ID advancement. We can even use + * flushpos_update_time in the next round to decide whether to get the + * latest flush position. + */ + rdt_data->phase = RDT_GET_CANDIDATE_XID; + rdt_data->remote_lsn = InvalidXLogRecPtr; + rdt_data->remote_oldestxid = InvalidFullTransactionId; + rdt_data->remote_nextxid = InvalidFullTransactionId; + rdt_data->reply_time = 0; + rdt_data->remote_wait_for = InvalidFullTransactionId; + rdt_data->candidate_xid = InvalidTransactionId; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Adjust the interval for advancing non-removable transaction IDs. + * + * We double the interval to try advancing the non-removable transaction IDs + * if there is no activity on the node. The maximum value of the interval is + * capped by wal_receiver_status_interval if it is not zero, otherwise to a + * 3 minutes which should be sufficient to avoid using CPU or network + * resources without much benefit. + * + * The interval is reset to a minimum value of 100ms once there is some + * activity on the node. + * + * XXX The use of wal_receiver_status_interval is a bit arbitrary so we can + * consider the other interval or a separate GUC if the need arises. + */ +static void +adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, bool new_xid_found) +{ + if (!new_xid_found && rdt_data->xid_advance_interval) + { + int max_interval = wal_receiver_status_interval + ? wal_receiver_status_interval * 1000 + : MAX_XID_ADVANCE_INTERVAL; + + /* + * No new transaction ID has been assigned since the last check, so + * double the interval, but not beyond the maximum allowable value. + */ + rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2, + max_interval); + } + else + { + /* + * A new transaction ID was found or the interval is not yet + * initialized, so set the interval to the minimum value. + */ + rdt_data->xid_advance_interval = MIN_XID_ADVANCE_INTERVAL; + } +} + /* * Exit routine for apply workers due to subscription parameter changes. */ @@ -4708,6 +5297,30 @@ InitializeLogRepWorker(void) apply_worker_exit(); } + /* + * Restart the worker if retain_dead_tuples was enabled during startup. + * + * At this point, the replication slot used for conflict detection might + * not exist yet, or could be dropped soon if the launcher perceives + * retain_dead_tuples as disabled. To avoid unnecessary tracking of + * oldest_nonremovable_xid when the slot is absent or at risk of being + * dropped, a restart is initiated. + * + * The oldest_nonremovable_xid should be initialized only when the + * retain_dead_tuples is enabled before launching the worker. See + * logicalrep_worker_launch. + */ + if (am_leader_apply_worker() && + MySubscription->retaindeadtuples && + !TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid)) + { + ereport(LOG, + errmsg("logical replication worker for subscription \"%s\" will restart because the option %s was enabled during startup", + MySubscription->name, "retain_dead_tuples")); + + apply_worker_exit(); + } + /* Setup synchronous commit according to the user's wishes */ SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); @@ -4864,6 +5477,14 @@ DisableSubscriptionAndExit(void) errmsg("subscription \"%s\" has been disabled because of an error", MySubscription->name)); + /* + * Skip the track_commit_timestamp check when disabling the worker due to + * an error, as verifying commit timestamps is unnecessary in this + * context. + */ + if (MySubscription->retaindeadtuples) + CheckSubDeadTupleRetention(false, true, WARNING); + proc_exit(0); } @@ -4909,7 +5530,7 @@ maybe_start_skipping_changes(XLogRecPtr finish_lsn) skip_xact_finish_lsn = finish_lsn; ereport(LOG, - errmsg("logical replication starts skipping transaction at LSN %X/%X", + errmsg("logical replication starts skipping transaction at LSN %X/%08X", LSN_FORMAT_ARGS(skip_xact_finish_lsn))); } @@ -4923,8 +5544,8 @@ stop_skipping_changes(void) return; ereport(LOG, - (errmsg("logical replication completed skipping transaction at LSN %X/%X", - LSN_FORMAT_ARGS(skip_xact_finish_lsn)))); + errmsg("logical replication completed skipping transaction at LSN %X/%08X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn))); /* Stop skipping changes */ skip_xact_finish_lsn = InvalidXLogRecPtr; @@ -5012,7 +5633,7 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) if (myskiplsn != finish_lsn) ereport(WARNING, errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name), - errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.", + errdetail("Remote transaction's finish WAL location (LSN) %X/%08X did not match skip-LSN %X/%08X.", LSN_FORMAT_ARGS(finish_lsn), LSN_FORMAT_ARGS(myskiplsn))); } @@ -5049,7 +5670,7 @@ apply_error_callback(void *arg) logicalrep_message_type(errarg->command), errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->remote_xid, @@ -5067,7 +5688,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.relname, errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, @@ -5086,7 +5707,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.attnames[errarg->remote_attnum], errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 082b4d9d32798..f4c977262c5a4 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -297,10 +297,12 @@ parse_output_parameters(List *options, PGOutputData *data) bool two_phase_option_given = false; bool origin_option_given = false; + /* Initialize optional parameters to defaults */ data->binary = false; data->streaming = LOGICALREP_STREAM_OFF; data->messages = false; data->two_phase = false; + data->publish_no_origin = false; foreach(lc, options) { diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 7440aae5a1a7e..8a649199ec69c 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -279,7 +279,7 @@ alter_replication_slot: ; /* - * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%X [TIMELINE %u] + * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%08X [TIMELINE %u] */ start_replication: K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline @@ -295,7 +295,7 @@ start_replication: } ; -/* START_REPLICATION SLOT slot LOGICAL %X/%X options */ +/* START_REPLICATION SLOT slot LOGICAL %X/%08X options */ start_logical_replication: K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options { diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 014ea8d25c6b7..b6930e2865953 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -155,7 +155,7 @@ UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {hexdigit}+\/{hexdigit}+ { uint32 hi, lo; - if (sscanf(yytext, "%X/%X", &hi, &lo) != 2) + if (sscanf(yytext, "%X/%08X", &hi, &lo) != 2) replication_yyerror(NULL, yyscanner, "invalid streaming start location"); yylval->recptr = ((uint64) hi) << 32 | lo; return RECPTR; diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index f9fec50ae883f..8605776ad8631 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" @@ -154,7 +155,7 @@ int max_replication_slots = 10; /* the maximum number of replication * Invalidate replication slots that have remained idle longer than this * duration; '0' disables it. */ -int idle_replication_slot_timeout_mins = 0; +int idle_replication_slot_timeout_secs = 0; /* * This GUC lists streaming replication standby server slot names that @@ -172,6 +173,7 @@ static SyncStandbySlotsConfigData *synchronized_standby_slots_config; static XLogRecPtr ss_oldest_flush_lsn = InvalidXLogRecPtr; static void ReplicationSlotShmemExit(int code, Datum arg); +static bool IsSlotForConflictCheck(const char *name); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -258,13 +260,17 @@ ReplicationSlotShmemExit(int code, Datum arg) /* * Check whether the passed slot name is valid and report errors at elevel. * + * An error will be reported for a reserved replication slot name if + * allow_reserved_name is set to false. + * * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow * the name to be used as a directory name on every supported OS. * * Returns whether the directory name is valid or not if elevel < ERROR. */ bool -ReplicationSlotValidateName(const char *name, int elevel) +ReplicationSlotValidateName(const char *name, bool allow_reserved_name, + int elevel) { const char *cp; @@ -300,9 +306,31 @@ ReplicationSlotValidateName(const char *name, int elevel) return false; } } + + if (!allow_reserved_name && IsSlotForConflictCheck(name)) + { + ereport(elevel, + errcode(ERRCODE_RESERVED_NAME), + errmsg("replication slot name \"%s\" is reserved", + name), + errdetail("The name \"%s\" is reserved for the conflict detection slot.", + CONFLICT_DETECTION_SLOT)); + + return false; + } + return true; } +/* + * Return true if the replication slot name is "pg_conflict_detection". + */ +static bool +IsSlotForConflictCheck(const char *name) +{ + return (strcmp(name, CONFLICT_DETECTION_SLOT) == 0); +} + /* * Create a new replication slot and mark it as used by this backend. * @@ -330,7 +358,12 @@ ReplicationSlotCreate(const char *name, bool db_specific, Assert(MyReplicationSlot == NULL); - ReplicationSlotValidateName(name, ERROR); + /* + * The logical launcher or pg_upgrade may create or migrate an internal + * slot, so using a reserved name is allowed in these cases. + */ + ReplicationSlotValidateName(name, IsBinaryUpgrade || IsLogicalLauncher(), + ERROR); if (failover) { @@ -581,6 +614,17 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) name))); } + /* + * Do not allow users to acquire the reserved slot. This scenario may + * occur if the launcher that owns the slot has terminated unexpectedly + * due to an error, and a backend process attempts to reuse the slot. + */ + if (!IsLogicalLauncher() && IsSlotForConflictCheck(name)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cannot acquire replication slot \"%s\"", name), + errdetail("The slot is reserved for conflict detection and can only be acquired by logical replication launcher.")); + /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. @@ -1591,8 +1635,8 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, uint64 ex = oldestLSN - restart_lsn; appendStringInfo(&err_detail, - ngettext("The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " byte.", - "The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " bytes.", + ngettext("The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " byte.", + "The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " bytes.", ex), LSN_FORMAT_ARGS(restart_lsn), ex); @@ -1612,13 +1656,10 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, case RS_INVAL_IDLE_TIMEOUT: { - int minutes = slot_idle_seconds / SECS_PER_MINUTE; - int secs = slot_idle_seconds % SECS_PER_MINUTE; - /* translator: %s is a GUC variable name */ - appendStringInfo(&err_detail, _("The slot's idle time of %dmin %02ds exceeds the configured \"%s\" duration of %dmin."), - minutes, secs, "idle_replication_slot_timeout", - idle_replication_slot_timeout_mins); + appendStringInfo(&err_detail, _("The slot's idle time of %lds exceeds the configured \"%s\" duration of %ds."), + slot_idle_seconds, "idle_replication_slot_timeout", + idle_replication_slot_timeout_secs); /* translator: %s is a GUC variable name */ appendStringInfo(&err_hint, _("You might need to increase \"%s\"."), "idle_replication_slot_timeout"); @@ -1656,7 +1697,7 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, static inline bool CanInvalidateIdleSlot(ReplicationSlot *s) { - return (idle_replication_slot_timeout_mins != 0 && + return (idle_replication_slot_timeout_secs != 0 && !XLogRecPtrIsInvalid(s->data.restart_lsn) && s->inactive_since > 0 && !(RecoveryInProgress() && s->data.synced)); @@ -1717,9 +1758,9 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, if (CanInvalidateIdleSlot(s)) { /* - * We simulate the invalidation due to idle_timeout as the minimum - * time idle time is one minute which makes tests take a long - * time. + * Simulate the invalidation due to idle_timeout to test the + * timeout behavior promptly, without waiting for it to trigger + * naturally. */ #ifdef USE_INJECTION_POINTS if (IS_INJECTION_POINT_ATTACHED("slot-timeout-inval")) @@ -1734,7 +1775,7 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, * idle_replication_slot_timeout GUC. */ if (TimestampDifferenceExceedsSeconds(s->inactive_since, now, - idle_replication_slot_timeout_mins * SECS_PER_MINUTE)) + idle_replication_slot_timeout_secs)) { *inactive_since = s->inactive_since; return RS_INVAL_IDLE_TIMEOUT; @@ -1890,15 +1931,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, SpinLockRelease(&s->mutex); - /* - * The logical replication slots shouldn't be invalidated as GUC - * max_slot_wal_keep_size is set to -1 and - * idle_replication_slot_timeout is set to 0 during the binary - * upgrade. See check_old_cluster_for_valid_slots() where we ensure - * that no invalidated before the upgrade. - */ - Assert(!(*invalidated && SlotIsLogical(s) && IsBinaryUpgrade)); - /* * Calculate the idle time duration of the slot if slot is marked * invalidated with RS_INVAL_IDLE_TIMEOUT. @@ -2045,6 +2077,10 @@ InvalidateObsoleteReplicationSlots(uint32 possible_causes, if (!s->in_use) continue; + /* Prevent invalidation of logical slots during binary upgrade */ + if (SlotIsLogical(s) && IsBinaryUpgrade) + continue; + if (InvalidatePossiblyObsoleteSlot(possible_causes, s, oldestLSN, dboid, snapshotConflictHorizon, &invalidated)) @@ -3057,22 +3093,3 @@ WaitForStandbyConfirmation(XLogRecPtr wait_for_lsn) ConditionVariableCancelSleep(); } - -/* - * GUC check_hook for idle_replication_slot_timeout - * - * The value of idle_replication_slot_timeout must be set to 0 during - * a binary upgrade. See start_postmaster() in pg_upgrade for more details. - */ -bool -check_idle_replication_slot_timeout(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != 0) - { - GUC_check_errdetail("\"%s\" must be set to 0 during binary upgrade mode.", - "idle_replication_slot_timeout"); - return false; - } - - return true; -} diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 36cc2ed4e440f..69f4c6157c518 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -566,7 +566,7 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) if (moveto < minlsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot advance replication slot to %X/%X, minimum is %X/%X", + errmsg("cannot advance replication slot to %X/%08X, minimum is %X/%08X", LSN_FORMAT_ARGS(moveto), LSN_FORMAT_ARGS(minlsn)))); /* Do the actual slot update, depending on the slot type */ diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index cc35984ad0085..32cf3a48b897d 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -258,7 +258,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) { char buffer[32]; - sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn)); + sprintf(buffer, "waiting for %X/%08X", LSN_FORMAT_ARGS(lsn)); set_ps_display_suffix(buffer); } @@ -566,7 +566,7 @@ SyncRepReleaseWaiters(void) LWLockRelease(SyncRepLock); - elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X", + elog(DEBUG3, "released %d procs up to write %X/%08X, %d procs up to flush %X/%08X, %d procs up to apply %X/%08X", numwrite, LSN_FORMAT_ARGS(writePtr), numflush, LSN_FORMAT_ARGS(flushPtr), numapply, LSN_FORMAT_ARGS(applyPtr)); diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c745..02004d621e73d 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 8c4d0fd9aed2b..b62811017116f 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -386,12 +386,12 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { if (first_stream) ereport(LOG, - (errmsg("started streaming WAL from primary at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("started streaming WAL from primary at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); else ereport(LOG, - (errmsg("restarted WAL streaming at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("restarted WAL streaming at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); first_stream = false; /* Initialize LogstreamResult and buffers for processing messages */ @@ -470,7 +470,7 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { ereport(LOG, (errmsg("replication terminated by primary server"), - errdetail("End of WAL reached on timeline %u at %X/%X.", + errdetail("End of WAL reached on timeline %u at %X/%08X.", startpointTLI, LSN_FORMAT_ARGS(LogstreamResult.Write)))); endofwal = true; @@ -711,7 +711,7 @@ WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%08X", LSN_FORMAT_ARGS(*startpoint)); set_ps_display(activitymsg); } @@ -1014,7 +1014,7 @@ XLogWalRcvFlush(bool dying, TimeLineID tli) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(LogstreamResult.Write)); set_ps_display(activitymsg); } @@ -1138,7 +1138,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) pq_sendbyte(&reply_message, requestReply ? 1 : 0); /* Send it */ - elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s", + elog(DEBUG2, "sending write %X/%08X flush %X/%08X apply %X/%08X%s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f2c33250e8b2f..ee911394a23c6 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -65,6 +65,7 @@ #include "funcapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" +#include "libpq/protocol.h" #include "miscadmin.h" #include "nodes/replnodes.h" #include "pgstat.h" @@ -84,6 +85,7 @@ #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procarray.h" #include "tcop/dest.h" #include "tcop/tcopprot.h" #include "utils/acl.h" @@ -258,6 +260,7 @@ static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessStandbyPSRequestMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); @@ -408,7 +411,7 @@ IdentifySystem(void) else logptr = GetFlushRecPtr(&currTLI); - snprintf(xloc, sizeof(xloc), "%X/%X", LSN_FORMAT_ARGS(logptr)); + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(logptr)); if (MyDatabaseId != InvalidOid) { @@ -515,7 +518,7 @@ ReadReplicationSlot(ReadReplicationSlotCmd *cmd) { char xloc[64]; - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(slot_contents.data.restart_lsn)); values[i] = CStringGetTextDatum(xloc); nulls[i] = false; @@ -733,13 +736,13 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: maxmsglen = PQ_LARGE_MESSAGE_LIMIT; break; - case 'c': /* CopyDone */ - case 'f': /* CopyFail */ - case 'H': /* Flush */ - case 'S': /* Sync */ + case PqMsg_CopyDone: + case PqMsg_CopyFail: + case PqMsg_Flush: + case PqMsg_Sync: maxmsglen = PQ_SMALL_MESSAGE_LIMIT; break; default: @@ -761,19 +764,19 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, /* Process the message */ switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: AppendIncrementalManifestData(ib, buf->data, buf->len); return true; - case 'c': /* CopyDone */ + case PqMsg_CopyDone: return false; - case 'H': /* Sync */ - case 'S': /* Flush */ + case PqMsg_Sync: + case PqMsg_Flush: /* Ignore these while in CopyOut mode as we do elsewhere. */ return true; - case 'f': + case PqMsg_CopyFail: ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("COPY from stdin failed: %s", @@ -892,12 +895,12 @@ StartReplication(StartReplicationCmd *cmd) switchpoint < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errmsg("requested starting point %X/%08X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%08X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint))); } sendTimeLineValidUpto = switchpoint; } @@ -939,9 +942,9 @@ StartReplication(StartReplicationCmd *cmd) if (FlushPtr < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); + errmsg("requested starting point %X/%08X is ahead of the WAL flush position of this server %X/%08X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr))); } /* Start streaming from the requested point */ @@ -983,7 +986,7 @@ StartReplication(StartReplicationCmd *cmd) Datum values[2]; bool nulls[2] = {0}; - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + snprintf(startpos_str, sizeof(startpos_str), "%X/%08X", LSN_FORMAT_ARGS(sendTimeLineValidUpto)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1324,7 +1327,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ReplicationSlotPersist(); } - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(MyReplicationSlot->data.confirmed_flush)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1567,7 +1570,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, tmpbuf.data, sizeof(int64)); /* output previously gathered data in a CopyData packet */ - pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + pq_putmessage_noblock(PqMsg_CopyData, ctx->out->data, ctx->out->len); CHECK_FOR_INTERRUPTS(); @@ -2303,7 +2306,7 @@ ProcessRepliesIfAny(void) case PqMsg_CopyDone: if (!streamingDoneSending) { - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; } @@ -2355,6 +2358,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'p': + ProcessStandbyPSRequestMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2429,7 +2436,7 @@ ProcessStandbyReplyMessage(void) /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); - elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s reply_time %s", + elog(DEBUG2, "write %X/%08X flush %X/%08X apply %X/%08X%s reply_time %s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), @@ -2701,6 +2708,60 @@ ProcessStandbyHSFeedbackMessage(void) } } +/* + * Process the request for a primary status update message. + */ +static void +ProcessStandbyPSRequestMessage(void) +{ + XLogRecPtr lsn = InvalidXLogRecPtr; + TransactionId oldestXidInCommit; + FullTransactionId nextFullXid; + FullTransactionId fullOldestXidInCommit; + WalSnd *walsnd = MyWalSnd; + TimestampTz replyTime; + + /* + * This shouldn't happen because we don't support getting primary status + * message from standby. + */ + if (RecoveryInProgress()) + elog(ERROR, "the primary status is unavailable during recovery"); + + replyTime = pq_getmsgint64(&reply_message); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Consider transactions in the current database, as only these are the + * ones replicated. + */ + oldestXidInCommit = GetOldestActiveTransactionId(true, false); + nextFullXid = ReadNextFullTransactionId(); + fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, + oldestXidInCommit); + lsn = GetXLogWriteRecPtr(); + + elog(DEBUG2, "sending primary status"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 's'); + pq_sendint64(&output_message, lsn); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(fullOldestXidInCommit)); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(nextFullXid)); + pq_sendint64(&output_message, GetCurrentTimestamp()); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); +} + /* * Compute how long send/receive loops should sleep. * @@ -3246,12 +3307,12 @@ XLogSendPhysical(void) wal_segment_close(xlogreader); /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; WalSndCaughtUp = true; - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + elog(DEBUG1, "walsender reached end of timeline at %X/%08X (sent up to %X/%08X)", LSN_FORMAT_ARGS(sendTimeLineValidUpto), LSN_FORMAT_ARGS(sentPtr)); return; @@ -3374,7 +3435,7 @@ XLogSendPhysical(void) memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); sentPtr = endptr; @@ -3392,7 +3453,7 @@ XLogSendPhysical(void) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(sentPtr)); set_ps_display(activitymsg); } @@ -4080,7 +4141,7 @@ WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) pq_sendbyte(&output_message, requestReply ? 1 : 0); /* ... and send it wrapped in CopyData */ - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); /* Set local flag */ if (requestReply) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 2ef0e7fbf3a69..adc9e7600e1ed 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -923,8 +923,9 @@ rewriteTargetListIU(List *targetList, apply_default = true; /* - * Can only insert DEFAULT into generated columns, regardless of - * any OVERRIDING clauses. + * Can only insert DEFAULT into generated columns. (The + * OVERRIDING clause does not apply to generated columns, so we + * don't consider it here.) */ if (att_tup->attgenerated && !apply_default) { diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31ec7..72ae3b3737d51 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. */ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index b78048328e113..0a8c054162f06 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,9 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include +#include + #include #include "miscadmin.h" @@ -94,12 +97,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. + * + * Depending on liburing and kernel version different features are + * supported. At least for the kernel a kernel version check does not suffice + * as various vendors do backport features to older kernels :(. + */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,16 +134,145 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } -static Size +/* + * Initializes pgaio_uring_caps, unless that's already done. + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for each io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down, backend exit is + * particularly affected. To address that, newer kernels (6.5) support + * using user-provided memory for the memory, by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported, we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can determine precisely how much + * memory we need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. As the memory is freed + * just below that's small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init(). + */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed: %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ +static size_t pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static size_t +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated to the page boundary, + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), + pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need, perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void @@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time) { int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). + */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + /* if supported, handle memory alignment / sizing for io_uring memory */ + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = (char *) shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back io_uring + * creating a memory mapping for each ring. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 36be179678d7a..bf8f77e6ff606 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -52,26 +52,26 @@ #define IO_WORKER_WAKEUP_FANOUT 2 -typedef struct AioWorkerSubmissionQueue +typedef struct PgAioWorkerSubmissionQueue { uint32 size; uint32 mask; uint32 head; uint32 tail; - uint32 ios[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerSubmissionQueue; + uint32 sqes[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerSubmissionQueue; -typedef struct AioWorkerSlot +typedef struct PgAioWorkerSlot { Latch *latch; bool in_use; -} AioWorkerSlot; +} PgAioWorkerSlot; -typedef struct AioWorkerControl +typedef struct PgAioWorkerControl { uint64 idle_worker_mask; - AioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerControl; + PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerControl; static size_t pgaio_worker_shmem_size(void); @@ -96,8 +96,8 @@ int io_workers = 3; static int io_worker_queue_size = 64; static int MyIoWorkerId; -static AioWorkerSubmissionQueue *io_worker_submission_queue; -static AioWorkerControl *io_worker_control; +static PgAioWorkerSubmissionQueue *io_worker_submission_queue; +static PgAioWorkerControl *io_worker_control; static size_t @@ -106,15 +106,15 @@ pgaio_worker_queue_shmem_size(int *queue_size) /* Round size up to next power of two so we can make a mask. */ *queue_size = pg_nextpower2_32(io_worker_queue_size); - return offsetof(AioWorkerSubmissionQueue, ios) + + return offsetof(PgAioWorkerSubmissionQueue, sqes) + sizeof(uint32) * *queue_size; } static size_t pgaio_worker_control_shmem_size(void) { - return offsetof(AioWorkerControl, workers) + - sizeof(AioWorkerSlot) * MAX_IO_WORKERS; + return offsetof(PgAioWorkerControl, workers) + + sizeof(PgAioWorkerSlot) * MAX_IO_WORKERS; } static size_t @@ -162,7 +162,7 @@ pgaio_worker_shmem_init(bool first_time) } static int -pgaio_choose_idle_worker(void) +pgaio_worker_choose_idle(void) { int worker; @@ -172,6 +172,7 @@ pgaio_choose_idle_worker(void) /* Find the lowest bit position, and clear it. */ worker = pg_rightmost_one_pos64(io_worker_control->idle_worker_mask); io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker); + Assert(io_worker_control->workers[worker].in_use); return worker; } @@ -179,7 +180,7 @@ pgaio_choose_idle_worker(void) static bool pgaio_worker_submission_queue_insert(PgAioHandle *ioh) { - AioWorkerSubmissionQueue *queue; + PgAioWorkerSubmissionQueue *queue; uint32 new_head; queue = io_worker_submission_queue; @@ -191,7 +192,7 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh) return false; /* full */ } - queue->ios[queue->head] = pgaio_io_get_id(ioh); + queue->sqes[queue->head] = pgaio_io_get_id(ioh); queue->head = new_head; return true; @@ -200,14 +201,14 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh) static uint32 pgaio_worker_submission_queue_consume(void) { - AioWorkerSubmissionQueue *queue; + PgAioWorkerSubmissionQueue *queue; uint32 result; queue = io_worker_submission_queue; if (queue->tail == queue->head) return UINT32_MAX; /* empty */ - result = queue->ios[queue->tail]; + result = queue->sqes[queue->tail]; queue->tail = (queue->tail + 1) & (queue->size - 1); return result; @@ -240,37 +241,37 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh) } static void -pgaio_worker_submit_internal(int nios, PgAioHandle *ios[]) +pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) { PgAioHandle *synchronous_ios[PGAIO_SUBMIT_BATCH_SIZE]; int nsync = 0; Latch *wakeup = NULL; int worker; - Assert(nios <= PGAIO_SUBMIT_BATCH_SIZE); + Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); - for (int i = 0; i < nios; ++i) + for (int i = 0; i < num_staged_ios; ++i) { - Assert(!pgaio_worker_needs_synchronous_execution(ios[i])); - if (!pgaio_worker_submission_queue_insert(ios[i])) + Assert(!pgaio_worker_needs_synchronous_execution(staged_ios[i])); + if (!pgaio_worker_submission_queue_insert(staged_ios[i])) { /* * We'll do it synchronously, but only after we've sent as many as * we can to workers, to maximize concurrency. */ - synchronous_ios[nsync++] = ios[i]; + synchronous_ios[nsync++] = staged_ios[i]; continue; } if (wakeup == NULL) { /* Choose an idle worker to wake up if we haven't already. */ - worker = pgaio_choose_idle_worker(); + worker = pgaio_worker_choose_idle(); if (worker >= 0) wakeup = io_worker_control->workers[worker].latch; - pgaio_debug_io(DEBUG4, ios[i], + pgaio_debug_io(DEBUG4, staged_ios[i], "choosing worker %d", worker); } @@ -316,6 +317,7 @@ pgaio_worker_die(int code, Datum arg) Assert(io_worker_control->workers[MyIoWorkerId].in_use); Assert(io_worker_control->workers[MyIoWorkerId].latch == MyLatch); + io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId); io_worker_control->workers[MyIoWorkerId].in_use = false; io_worker_control->workers[MyIoWorkerId].latch = NULL; LWLockRelease(AioWorkerSubmissionQueueLock); @@ -488,7 +490,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) IO_WORKER_WAKEUP_FANOUT); for (int i = 0; i < nwakeups; ++i) { - if ((worker = pgaio_choose_idle_worker()) < 0) + if ((worker = pgaio_worker_choose_idle()) < 0) break; latches[nlatches++] = io_worker_control->workers[worker].latch; } @@ -573,6 +575,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) } CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } } error_context_stack = errcallback.previous; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 667aa0c0c78d4..67431208e7f5f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2743,12 +2743,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * because mdread doesn't complain about reads beyond EOF (when * zero_damaged_pages is ON) and so a previous attempt to read a block * beyond EOF could have left a "valid" zero-filled buffer. - * Unfortunately, we have also seen this case occurring because of - * buggy Linux kernels that sometimes return an lseek(SEEK_END) result - * that doesn't account for a recent write. In that situation, the - * pre-existing buffer would contain valid data that we don't want to - * overwrite. Since the legitimate cases should always have left a - * zero-filled buffer, complain if not PageIsNew. + * + * This has also been observed when relation was overwritten by + * external process. Since the legitimate cases should always have + * left a zero-filled buffer, complain if not PageIsNew. */ if (existing_id >= 0) { @@ -2778,8 +2776,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", existing_hdr->tag.blockNum, - relpath(bmr.smgr->smgr_rlocator, fork).str), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); + relpath(bmr.smgr->smgr_rlocator, fork).str))); /* * We *must* do smgr[zero]extend before succeeding, else the page @@ -3339,10 +3336,10 @@ UnpinBufferNoOwner(BufferDesc *buf) * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. - * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE - * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, - * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even - * unlogged buffers, which are otherwise skipped. The remaining flags + * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is + * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, + * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write + * even unlogged buffers, which are otherwise skipped. The remaining flags * currently have no effect here. */ static void @@ -3367,7 +3364,7 @@ BufferSync(int flags) * recovery, we write all dirty buffers. */ if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_FLUSH_ALL)))) + CHECKPOINT_FLUSH_UNLOGGED)))) mask |= BM_PERMANENT; /* @@ -4550,11 +4547,9 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, if (RelFileLocatorBackendIsTemp(rlocator)) { if (rlocator.backend == MyProcNumber) - { - for (j = 0; j < nforks; j++) - DropRelationLocalBuffers(rlocator.locator, forkNum[j], - firstDelBlock[j]); - } + DropRelationLocalBuffers(rlocator.locator, forkNum, nforks, + firstDelBlock); + return; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index ba26627f7b00d..3da9c41ee1d7a 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -660,10 +660,11 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) * See DropRelationBuffers in bufmgr.c for more notes. */ void -DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, - BlockNumber firstDelBlock) +DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock) { int i; + int j; for (i = 0; i < NLocBuffer; i++) { @@ -672,12 +673,18 @@ DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, buf_state = pg_atomic_read_u32(&bufHdr->state); - if ((buf_state & BM_TAG_VALID) && - BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum && - bufHdr->tag.blockNum >= firstDelBlock) + if (!(buf_state & BM_TAG_VALID) || + !BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) + continue; + + for (j = 0; j < nforks; j++) { - InvalidateLocalBuffer(bufHdr, true); + if (BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + bufHdr->tag.blockNum >= firstDelBlock[j]) + { + InvalidateLocalBuffer(bufHdr, true); + break; + } } } } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd55646..a4ec7959f31cf 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -400,25 +400,22 @@ pg_fsync(int fd) * portable, even if it runs ok on the current system. * * We assert here that a descriptor for a file was opened with write - * permissions (either O_RDWR or O_WRONLY) and for a directory without - * write permissions (O_RDONLY). + * permissions (i.e., not O_RDONLY) and for a directory without write + * permissions (O_RDONLY). Notice that the assertion check is made even + * if fsync() is disabled. * - * Ignore any fstat errors and let the follow-up fsync() do its work. - * Doing this sanity check here counts for the case where fsync() is - * disabled. + * If fstat() fails, ignore it and let the follow-up fsync() complain. */ if (fstat(fd, &st) == 0) { int desc_flags = fcntl(fd, F_GETFL); - /* - * O_RDONLY is historically 0, so just make sure that for directories - * no write flags are used. - */ + desc_flags &= O_ACCMODE; + if (S_ISDIR(st.st_mode)) - Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + Assert(desc_flags == O_RDONLY); else - Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + Assert(desc_flags != O_RDONLY); } errno = 0; #endif diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index 1d4fd31ffedbc..1682cc6d34c7f 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -15,6 +15,20 @@ * current backend. This function guarantees that only one backend * initializes the segment and that all other backends just attach it. * + * A DSA can be created in or retrieved from the registry by calling + * GetNamedDSA(). As with GetNamedDSMSegment(), if a DSA with the provided + * name does not yet exist, it is created. Otherwise, GetNamedDSA() + * ensures the DSA is attached to the current backend. This function + * guarantees that only one backend initializes the DSA and that all other + * backends just attach it. + * + * A dshash table can be created in or retrieved from the registry by + * calling GetNamedDSHash(). As with GetNamedDSMSegment(), if a hash + * table with the provided name does not yet exist, it is created. + * Otherwise, GetNamedDSHash() ensures the hash table is attached to the + * current backend. This function guarantees that only one backend + * initializes the table and that all other backends just attach it. + * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,12 +40,20 @@ #include "postgres.h" +#include "funcapi.h" #include "lib/dshash.h" #include "storage/dsm_registry.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "utils/builtins.h" #include "utils/memutils.h" +#define DSMR_NAME_LEN 128 + +#define DSMR_DSA_TRANCHE_SUFFIX " DSA" +#define DSMR_DSA_TRANCHE_SUFFIX_LEN (sizeof(DSMR_DSA_TRANCHE_SUFFIX) - 1) +#define DSMR_DSA_TRANCHE_NAME_LEN (DSMR_NAME_LEN + DSMR_DSA_TRANCHE_SUFFIX_LEN) + typedef struct DSMRegistryCtxStruct { dsa_handle dsah; @@ -40,15 +62,55 @@ typedef struct DSMRegistryCtxStruct static DSMRegistryCtxStruct *DSMRegistryCtx; -typedef struct DSMRegistryEntry +typedef struct NamedDSMState { - char name[64]; dsm_handle handle; size_t size; +} NamedDSMState; + +typedef struct NamedDSAState +{ + dsa_handle handle; + int tranche; + char tranche_name[DSMR_DSA_TRANCHE_NAME_LEN]; +} NamedDSAState; + +typedef struct NamedDSHState +{ + NamedDSAState dsa; + dshash_table_handle handle; + int tranche; + char tranche_name[DSMR_NAME_LEN]; +} NamedDSHState; + +typedef enum DSMREntryType +{ + DSMR_ENTRY_TYPE_DSM, + DSMR_ENTRY_TYPE_DSA, + DSMR_ENTRY_TYPE_DSH, +} DSMREntryType; + +static const char *const DSMREntryTypeNames[] = +{ + [DSMR_ENTRY_TYPE_DSM] = "segment", + [DSMR_ENTRY_TYPE_DSA] = "area", + [DSMR_ENTRY_TYPE_DSH] = "hash", +}; + +typedef struct DSMRegistryEntry +{ + char name[DSMR_NAME_LEN]; + DSMREntryType type; + union + { + NamedDSMState dsm; + NamedDSAState dsa; + NamedDSHState dsh; + } data; } DSMRegistryEntry; static const dshash_parameters dsh_params = { - offsetof(DSMRegistryEntry, handle), + offsetof(DSMRegistryEntry, type), sizeof(DSMRegistryEntry), dshash_strcmp, dshash_strhash, @@ -141,7 +203,7 @@ GetNamedDSMSegment(const char *name, size_t size, ereport(ERROR, (errmsg("DSM segment name cannot be empty"))); - if (strlen(name) >= offsetof(DSMRegistryEntry, handle)) + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) ereport(ERROR, (errmsg("DSM segment name too long"))); @@ -158,32 +220,39 @@ GetNamedDSMSegment(const char *name, size_t size, entry = dshash_find_or_insert(dsm_registry_table, name, found); if (!(*found)) { + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; + + entry->type = DSMR_ENTRY_TYPE_DSM; + /* Initialize the segment. */ - dsm_segment *seg = dsm_create(size, 0); + seg = dsm_create(size, 0); dsm_pin_segment(seg); dsm_pin_mapping(seg); - entry->handle = dsm_segment_handle(seg); - entry->size = size; + state->handle = dsm_segment_handle(seg); + state->size = size; ret = dsm_segment_address(seg); if (init_callback) (*init_callback) (ret); } - else if (entry->size != size) - { + else if (entry->type != DSMR_ENTRY_TYPE_DSM) ereport(ERROR, - (errmsg("requested DSM segment size does not match size of " - "existing segment"))); - } + (errmsg("requested DSM segment does not match type of existing entry"))); + else if (entry->data.dsm.size != size) + ereport(ERROR, + (errmsg("requested DSM segment size does not match size of existing segment"))); else { - dsm_segment *seg = dsm_find_mapping(entry->handle); + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; /* If the existing segment is not already attached, attach it now. */ + seg = dsm_find_mapping(state->handle); if (seg == NULL) { - seg = dsm_attach(entry->handle); + seg = dsm_attach(state->handle); if (seg == NULL) elog(ERROR, "could not map dynamic shared memory segment"); @@ -198,3 +267,220 @@ GetNamedDSMSegment(const char *name, size_t size, return ret; } + +/* + * Initialize or attach a named DSA. + * + * This routine returns a pointer to the DSA. A new LWLock tranche ID will be + * generated if needed. Note that the lock tranche will be registered with the + * provided name. Also note that this should be called at most once for a + * given DSA in each backend. + */ +dsa_area * +GetNamedDSA(const char *name, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dsa_area *ret; + + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSA name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSA name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *state = &entry->data.dsa; + + entry->type = DSMR_ENTRY_TYPE_DSA; + + /* Initialize the LWLock tranche for the DSA. */ + state->tranche = LWLockNewTrancheId(); + strcpy(state->tranche_name, name); + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Initialize the DSA. */ + ret = dsa_create(state->tranche); + dsa_pin(ret); + dsa_pin_mapping(ret); + + /* Store handle for other backends to use. */ + state->handle = dsa_get_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSA) + ereport(ERROR, + (errmsg("requested DSA does not match type of existing entry"))); + else + { + NamedDSAState *state = &entry->data.dsa; + + if (dsa_is_attached(state->handle)) + ereport(ERROR, + (errmsg("requested DSA already attached to current process"))); + + /* Initialize existing LWLock tranche for the DSA. */ + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Attach to existing DSA. */ + ret = dsa_attach(state->handle); + dsa_pin_mapping(ret); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +/* + * Initialize or attach a named dshash table. + * + * This routine returns the address of the table. The tranche_id member of + * params is ignored; new tranche IDs will be generated if needed. Note that + * the DSA lock tranche will be registered with the provided name with " DSA" + * appended. The dshash lock tranche will be registered with the provided + * name. Also note that this should be called at most once for a given table + * in each backend. + */ +dshash_table * +GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_table *ret; + + Assert(params); + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSHash name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSHash name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dshash_parameters params_copy; + dsa_area *dsa; + + entry->type = DSMR_ENTRY_TYPE_DSH; + + /* Initialize the LWLock tranche for the DSA. */ + dsa_state->tranche = LWLockNewTrancheId(); + sprintf(dsa_state->tranche_name, "%s%s", name, DSMR_DSA_TRANCHE_SUFFIX); + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + + /* Initialize the LWLock tranche for the dshash table. */ + dsh_state->tranche = LWLockNewTrancheId(); + strcpy(dsh_state->tranche_name, name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Initialize the DSA for the hash table. */ + dsa = dsa_create(dsa_state->tranche); + dsa_pin(dsa); + dsa_pin_mapping(dsa); + + /* Initialize the dshash table. */ + memcpy(¶ms_copy, params, sizeof(dshash_parameters)); + params_copy.tranche_id = dsh_state->tranche; + ret = dshash_create(dsa, ¶ms_copy, NULL); + + /* Store handles for other backends to use. */ + dsa_state->handle = dsa_get_handle(dsa); + dsh_state->handle = dshash_get_hash_table_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSH) + ereport(ERROR, + (errmsg("requested DSHash does not match type of existing entry"))); + else + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dsa_area *dsa; + + /* XXX: Should we verify params matches what table was created with? */ + + if (dsa_is_attached(dsa_state->handle)) + ereport(ERROR, + (errmsg("requested DSHash already attached to current process"))); + + /* Initialize existing LWLock tranches for the DSA and dshash table. */ + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Attach to existing DSA for the hash table. */ + dsa = dsa_attach(dsa_state->handle); + dsa_pin_mapping(dsa); + + /* Attach to existing dshash table. */ + ret = dshash_attach(dsa, params, dsh_state->handle, NULL); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +Datum +pg_get_dsm_registry_allocations(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_seq_status status; + + InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + init_dsm_registry(); + MemoryContextSwitchTo(oldcontext); + + dshash_seq_init(&status, dsm_registry_table, false); + while ((entry = dshash_seq_next(&status)) != NULL) + { + Datum vals[3]; + bool nulls[3] = {0}; + + vals[0] = CStringGetTextDatum(entry->name); + vals[1] = CStringGetTextDatum(DSMREntryTypeNames[entry->type]); + + /* + * Since we can't know the size of DSA/dshash entries without first + * attaching to them, return NULL for those. + */ + if (entry->type == DSMR_ENTRY_TYPE_DSM) + vals[2] = Int64GetDatum(entry->data.dsm.size); + else + nulls[2] = true; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, vals, nulls); + } + dshash_seq_term(&status); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index c6aefd2f688dd..beadeb5e46afa 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout, if (!(wakeEvents & WL_LATCH_SET)) latch = NULL; ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); - ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, - (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), - NULL); + + if (IsUnderPostmaster) + ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, + (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), + NULL); if (WaitEventSetWait(LatchWaitSet, (wakeEvents & WL_TIMEOUT) ? timeout : -1, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e5b945a9ee39c..bf987aed8d327 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1622,58 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } -/* - * TransactionIdIsActive -- is xid the top-level XID of an active backend? - * - * This differs from TransactionIdIsInProgress in that it ignores prepared - * transactions, as well as transactions running on the primary if we're in - * hot standby. Also, we ignore subtransactions since that's not needed - * for current uses. - */ -bool -TransactionIdIsActive(TransactionId xid) -{ - bool result = false; - ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; - int i; - - /* - * Don't bother checking a transaction older than RecentXmin; it could not - * possibly still be running. - */ - if (TransactionIdPrecedes(xid, RecentXmin)) - return false; - - LWLockAcquire(ProcArrayLock, LW_SHARED); - - for (i = 0; i < arrayP->numProcs; i++) - { - int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; - TransactionId pxid; - - /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(other_xids[i]); - - if (!TransactionIdIsValid(pxid)) - continue; - - if (proc->pid == 0) - continue; /* ignore prepared transactions */ - - if (TransactionIdEquals(pxid, xid)) - { - result = true; - break; - } - } - - LWLockRelease(ProcArrayLock); - - return result; -} - /* * Determine XID horizons. @@ -2866,8 +2814,10 @@ GetRunningTransactionData(void) * * Similar to GetSnapshotData but returns just oldestActiveXid. We include * all PGPROCs with an assigned TransactionId, even VACUUM processes. - * We look at all databases, though there is no need to include WALSender - * since this has no effect on hot standby conflicts. + * + * If allDbs is true, we look at all databases, though there is no need to + * include WALSender since this has no effect on hot standby conflicts. If + * allDbs is false, skip processes attached to other databases. * * This is never executed during recovery so there is no need to look at * KnownAssignedXids. @@ -2875,9 +2825,12 @@ GetRunningTransactionData(void) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * inCommitOnly indicates getting the oldestActiveXid among the transactions + * in the commit critical section. */ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2904,6 +2857,8 @@ GetOldestActiveTransactionId(void) for (index = 0; index < arrayP->numProcs; index++) { TransactionId xid; + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -2911,6 +2866,13 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (inCommitOnly && + (proc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a9bb540b55ac2..087821311cceb 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -728,7 +728,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len) { - Assert(backendPID != 0); + if (backendPID == 0) + { + ereport(LOG, (errmsg("invalid cancel request with PID 0"))); + return; + } /* * See if we have a matching backend. Reading the pss_pid and diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76b1..ca3656fc76f43 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -679,12 +679,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) */ for (i = 0; i < shm_ent_page_count; i++) { - volatile uint64 touch pg_attribute_unused(); - page_ptrs[i] = startptr + (i * os_page_size); if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + pg_numa_touch_mem_if_required(page_ptrs[i]); CHECK_FOR_INTERRUPTS(); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 7fa8d9247e097..4222bdab07807 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1376,7 +1376,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) if (xlrec.subxid_overflow) elog(DEBUG2, - "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, @@ -1384,7 +1384,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) CurrRunningXacts->nextXid); else elog(DEBUG2, - "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c5f..cd3e43c448aed 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -10,7 +10,6 @@ my $output_path = '.'; my $lastlockidx = -1; -my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,24 @@ # -# First, record the predefined LWLocks listed in wait_event_names.txt. We'll -# cross-check those with the ones in lwlocklist.h. +# First, record the predefined LWLocks and built-in tranches listed in +# wait_event_names.txt. We'll cross-check those with the ones in lwlocklist.h. # +my @wait_event_tranches; my @wait_event_lwlocks; my $record_lwlocks = 0; +my $in_tranches = 0; while (<$wait_event_names>) { chomp; # Check for end marker. - last if /^# END OF PREDEFINED LWLOCKS/; + if (/^# END OF PREDEFINED LWLOCKS/) + { + $in_tranches = 1; + next; + } # Skip comments and empty lines. next if /^#/; @@ -55,13 +60,29 @@ # Go to the next line if we are not yet recording LWLocks. next if not $record_lwlocks; + # Stop recording if we reach another section. + last if /^Section:/; + # Record the LWLock. (my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); + + if ($in_tranches) + { + push(@wait_event_tranches, $waiteventname); + } + else + { + push(@wait_event_lwlocks, $waiteventname); + } } +# +# While gathering the list of predefined LWLocks, cross-check the lists in +# lwlocklist.h with the wait events we just recorded. +# my $in_comment = 0; -my $i = 0; +my $lwlock_count = 0; +my $tranche_count = 0; while (<$lwlocklist>) { chomp; @@ -82,40 +103,72 @@ next; } - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; + # + # Gather list of predefined LWLocks and cross-check with the wait events. + # + if (/^PG_LWLOCK\((\d+),\s+(\w+)\)$/) + { + my ($lockidx, $lockname) = ($1, $2); - (my $lockidx, my $lockname) = ($1, $2); + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $lwlock_count >= scalar @wait_event_lwlocks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$wait_event_lwlocks[$lwlock_count] in wait_event_names.txt and " + . "$lockname in lwlocklist.h)" + if $wait_event_lwlocks[$lwlock_count] ne $lockname; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . "lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + $lwlock_count++; - while ($lastlockidx < $lockidx - 1) + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + } + $lastlockidx = $lockidx; + + # Add a "Lock" suffix to each lock name, as the C code depends on that. + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + + next; + } + + # + # Cross-check the built-in LWLock tranches with the wait events. + # + if (/^PG_LWLOCKTRANCHE\((\w+),\s+(\w+)\)$/) { - ++$lastlockidx; - $continue = ",\n"; + my ($tranche_id, $tranche_name) = ($1, $2); + + die "$tranche_name defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $tranche_count >= scalar @wait_event_tranches; + die + "lists of built-in LWLock tranches do not match (first mismatch at " + . "$wait_event_tranches[$tranche_count] in wait_event_names.txt and " + . "$tranche_name in lwlocklist.h)" + if $wait_event_tranches[$tranche_count] ne $tranche_name; + + $tranche_count++; + + next; } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die "unable to parse lwlocklist.h line \"$_\""; } die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . "lwlocklist.h" - if $i < scalar @wait_event_lwlocks; + "$wait_event_lwlocks[$lwlock_count] defined in wait_event_names.txt but " + . " missing from lwlocklist.h" + if $lwlock_count < scalar @wait_event_lwlocks; + +die + "$wait_event_tranches[$tranche_count] defined in wait_event_names.txt but " + . "missing from lwlocklist.h" + if $tranche_count < scalar @wait_event_tranches; print $h "\n"; printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 2776ceb295be4..62f3471448ebc 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3539,9 +3539,9 @@ AtPrepare_Locks(void) * but that probably costs more cycles. */ void -PostPrepare_Locks(TransactionId xid) +PostPrepare_Locks(FullTransactionId fxid) { - PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + PGPROC *newproc = TwoPhaseGetDummyProc(fxid, false); HASH_SEQ_STATUS status; LOCALLOCK *locallock; LOCK *lock; @@ -4324,11 +4324,11 @@ DumpAllLocks(void) * and PANIC anyway. */ void -lock_twophase_recover(TransactionId xid, uint16 info, +lock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, false); LOCKTAG *locktag; LOCKMODE lockmode; LOCKMETHODID lockmethodid; @@ -4505,7 +4505,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, * starting up into hot standby mode. */ void -lock_twophase_standby_recover(TransactionId xid, uint16 info, +lock_twophase_standby_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; @@ -4524,7 +4524,7 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, if (lockmode == AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION) { - StandbyAcquireAccessExclusiveLock(xid, + StandbyAcquireAccessExclusiveLock(XidFromFullTransactionId(fxid), locktag->locktag_field1 /* dboid */ , locktag->locktag_field2 /* reloid */ ); } @@ -4537,11 +4537,11 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, * Find and release the lock indicated by the 2PC record. */ void -lock_twophase_postcommit(TransactionId xid, uint16 info, +lock_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, true); LOCKTAG *locktag; LOCKMETHODID lockmethodid; LockMethod lockMethodTable; @@ -4563,10 +4563,10 @@ lock_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. */ void -lock_twophase_postabort(TransactionId xid, uint16 info, +lock_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - lock_twophase_postcommit(xid, info, recdata, len); + lock_twophase_postcommit(fxid, info, recdata, len); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 46f44bc45113f..ec9c345ffdfb8 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -122,9 +122,8 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * own tranche. We absorb the names of these tranches from there into * BuiltinTrancheNames here. * - * 2. There are some predefined tranches for built-in groups of locks. - * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names - * appear in BuiltinTrancheNames[] below. + * 2. There are some predefined tranches for built-in groups of locks defined + * in lwlocklist.h. We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche * or LWLockRegisterTranche. The names of these that are known in the current @@ -135,49 +134,10 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_LWLOCKTRANCHE(id, lockname) [LWTRANCHE_##id] = CppAsString(lockname), #include "storage/lwlocklist.h" #undef PG_LWLOCK - [LWTRANCHE_XACT_BUFFER] = "XactBuffer", - [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", - [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", - [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", - [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", - [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", - [LWTRANCHE_SERIAL_BUFFER] = "SerialBuffer", - [LWTRANCHE_WAL_INSERT] = "WALInsert", - [LWTRANCHE_BUFFER_CONTENT] = "BufferContent", - [LWTRANCHE_REPLICATION_ORIGIN_STATE] = "ReplicationOriginState", - [LWTRANCHE_REPLICATION_SLOT_IO] = "ReplicationSlotIO", - [LWTRANCHE_LOCK_FASTPATH] = "LockFastPath", - [LWTRANCHE_BUFFER_MAPPING] = "BufferMapping", - [LWTRANCHE_LOCK_MANAGER] = "LockManager", - [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", - [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", - [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", - [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", - [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", - [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", - [LWTRANCHE_PER_SESSION_RECORD_TYPMOD] = "PerSessionRecordTypmod", - [LWTRANCHE_SHARED_TUPLESTORE] = "SharedTupleStore", - [LWTRANCHE_SHARED_TIDBITMAP] = "SharedTidBitmap", - [LWTRANCHE_PARALLEL_APPEND] = "ParallelAppend", - [LWTRANCHE_PER_XACT_PREDICATE_LIST] = "PerXactPredicateList", - [LWTRANCHE_PGSTATS_DSA] = "PgStatsDSA", - [LWTRANCHE_PGSTATS_HASH] = "PgStatsHash", - [LWTRANCHE_PGSTATS_DATA] = "PgStatsData", - [LWTRANCHE_LAUNCHER_DSA] = "LogicalRepLauncherDSA", - [LWTRANCHE_LAUNCHER_HASH] = "LogicalRepLauncherHash", - [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", - [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", - [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", - [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", - [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", - [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", - [LWTRANCHE_XACT_SLRU] = "XactSLRU", - [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", - [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", +#undef PG_LWLOCKTRANCHE }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d82114ffca165..c07fb58835557 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -191,7 +191,7 @@ * AtPrepare_PredicateLocks(void); * PostPrepare_PredicateLocks(TransactionId xid); * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); - * predicatelock_twophase_recover(TransactionId xid, uint16 info, + * predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, * void *recdata, uint32 len); */ @@ -4856,7 +4856,7 @@ AtPrepare_PredicateLocks(void) * anyway. We only need to clean up our local state. */ void -PostPrepare_PredicateLocks(TransactionId xid) +PostPrepare_PredicateLocks(FullTransactionId fxid) { if (MySerializableXact == InvalidSerializableXact) return; @@ -4879,12 +4879,12 @@ PostPrepare_PredicateLocks(TransactionId xid) * commits or aborts. */ void -PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) +PredicateLockTwoPhaseFinish(FullTransactionId fxid, bool isCommit) { SERIALIZABLEXID *sxid; SERIALIZABLEXIDTAG sxidtag; - sxidtag.xid = xid; + sxidtag.xid = XidFromFullTransactionId(fxid); LWLockAcquire(SerializableXactHashLock, LW_SHARED); sxid = (SERIALIZABLEXID *) @@ -4906,10 +4906,11 @@ PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) * Re-acquire a predicate lock belonging to a transaction that was prepared. */ void -predicatelock_twophase_recover(TransactionId xid, uint16 info, +predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePredicateRecord *record; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(len == sizeof(TwoPhasePredicateRecord)); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2f8c3d5f91822..0cecd4649020f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -988,6 +988,7 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, stmt->stmt_location = query->stmt_location; stmt->stmt_len = query->stmt_len; stmt->queryId = query->queryId; + stmt->planOrigin = PLAN_STMT_INTERNAL; } else { diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index d1593f38b35fd..08791b8f75ec2 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1350,24 +1350,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. - * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. */ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 25fe3d5801665..4f4191b0ea6b4 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -943,17 +943,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_CheckPointStmt: - if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - /* translator: %s is name of a SQL command, eg CHECKPOINT */ - errmsg("permission denied to execute %s command", - "CHECKPOINT"), - errdetail("Only roles with privileges of the \"%s\" role may execute this command.", - "pg_checkpoint"))); - - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT | - (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE)); + ExecCheckpoint(pstate, (CheckPointStmt *) parsetree); break; /* @@ -1244,6 +1234,7 @@ ProcessUtilitySlow(ParseState *pstate, wrapper->utilityStmt = stmt; wrapper->stmt_location = pstmt->stmt_location; wrapper->stmt_len = pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, queryString, @@ -1343,7 +1334,7 @@ ProcessUtilitySlow(ParseState *pstate, */ switch (stmt->subtype) { - case 'T': /* ALTER DOMAIN DEFAULT */ + case AD_AlterDefault: /* * Recursively alter column default for table and, @@ -1353,30 +1344,30 @@ ProcessUtilitySlow(ParseState *pstate, AlterDomainDefault(stmt->typeName, stmt->def); break; - case 'N': /* ALTER DOMAIN DROP NOT NULL */ + case AD_DropNotNull: address = AlterDomainNotNull(stmt->typeName, false); break; - case 'O': /* ALTER DOMAIN SET NOT NULL */ + case AD_SetNotNull: address = AlterDomainNotNull(stmt->typeName, true); break; - case 'C': /* ADD CONSTRAINT */ + case AD_AddConstraint: address = AlterDomainAddConstraint(stmt->typeName, stmt->def, &secondaryObject); break; - case 'X': /* DROP CONSTRAINT */ + case AD_DropConstraint: address = AlterDomainDropConstraint(stmt->typeName, stmt->name, stmt->behavior, stmt->missing_ok); break; - case 'V': /* VALIDATE CONSTRAINT */ + case AD_ValidateConstraint: address = AlterDomainValidateConstraint(stmt->typeName, stmt->name); @@ -1974,6 +1965,7 @@ ProcessUtilityForAlterTable(Node *stmt, AlterTableUtilityContext *context) wrapper->utilityStmt = stmt; wrapper->stmt_location = context->pstmt->stmt_location; wrapper->stmt_len = context->pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, context->queryString, diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index b77d8c23d3694..4801fe90089e6 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -36,7 +36,7 @@ t_isalpha(const char *ptr) { int clen = pg_mblen(ptr); wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ if (clen == 1 || database_ctype_is_c) return isalpha(TOUCHAR(ptr)); @@ -51,7 +51,7 @@ t_isalnum(const char *ptr) { int clen = pg_mblen(ptr); wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ if (clen == 1 || database_ctype_is_c) return isalnum(TOUCHAR(ptr)); diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 79bcd32a0639e..e2dd3da3aa35f 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -299,7 +299,7 @@ TParserInit(char *str, int len) */ if (prs->charmaxlen > 1) { - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ prs->usewide = true; if (database_ctype_is_c) diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 8b57845e8709f..6bc91ce0dadda 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -212,6 +212,11 @@ int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; PgStat_LocalState pgStatLocal; +/* + * Track pending reports for fixed-numbered stats, used by + * pgstat_report_stat(). + */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -370,7 +375,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +441,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +458,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +476,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -708,29 +709,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +766,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +800,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d37..8714a85e2d936 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -66,6 +66,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +82,7 @@ pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -301,18 +303,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) return false; } -/* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - /* * Callback to flush out locally pending backend statistics. * diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a571e..13ae57ed6498d 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -167,15 +168,6 @@ pgstat_fetch_stat_io(void) return &pgStatLocal.snapshot.io; } -/* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - /* * Simpler wrapper of pgstat_io_flush_cb() */ diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 28587e2916b1d..69df741cbf630 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -744,7 +744,7 @@ PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) * Load the saved counts into our local pgstats state. */ void -pgstat_twophase_postcommit(TransactionId xid, uint16 info, +pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; @@ -780,7 +780,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, * as aborted. */ void -pgstat_twophase_postabort(TransactionId xid, uint16 info, +pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45b6..7bd8744accb0e 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -143,15 +143,6 @@ pgstat_get_slru_index(const char *name) return (SLRU_NUM_ELEMENTS - 1); } -/* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - /* * Flush out locally pending SLRU stats entries * @@ -247,6 +238,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90d2..0d04480d2f6d0 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -71,6 +71,15 @@ pgstat_fetch_stat_wal(void) return &pgStatLocal.snapshot.wal; } +/* + * To determine whether WAL usage happened. + */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + /* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -136,15 +145,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4da68312b5f97..0be307d2ca04b 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -356,9 +356,13 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) # -# Predefined LWLocks (i.e., those declared in lwlocknames.h) must be listed -# in the section above and must be listed in the same order as in -# lwlocknames.h. Other LWLocks must be listed in the section below. +# Predefined LWLocks (i.e., those declared at the top of lwlocknames.h) must be +# listed in the section above and must be listed in the same order as in +# lwlocknames.h. +# +# Likewise, the built-in LWLock tranches (i.e., those declared at the bottom of +# lwlocknames.h) must be listed in the section below and must be listed in the +# same order as in lwlocknames.h. # XactBuffer "Waiting for I/O on a transaction status SLRU buffer." diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 4a233b63c3280..ffeacf2b819f3 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -23,6 +23,7 @@ OBJS = \ arrayutils.o \ ascii.o \ bool.o \ + bytea.o \ cash.o \ char.o \ cryptohashfuncs.o \ diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index ca3c5ee3df3ae..1213f9106d515 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -134,6 +134,22 @@ static AclResult pg_role_aclcheck(Oid role_oid, Oid roleid, AclMode mode); static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue); +/* + * Test whether an identifier char can be left unquoted in ACLs. + * + * Formerly, we used isalnum() even on non-ASCII characters, resulting in + * unportable behavior. To ensure dump compatibility with old versions, + * we now treat high-bit-set characters as always requiring quoting during + * putid(), but getid() will always accept them without quotes. + */ +static inline bool +is_safe_acl_char(unsigned char c, bool is_getid) +{ + if (IS_HIGHBIT_SET(c)) + return is_getid; + return isalnum(c) || c == '_'; +} + /* * getid * Consumes the first alphanumeric string (identifier) found in string @@ -159,21 +175,22 @@ getid(const char *s, char *n, Node *escontext) while (isspace((unsigned char) *s)) s++; - /* This code had better match what putid() does, below */ for (; *s != '\0' && - (isalnum((unsigned char) *s) || - *s == '_' || - *s == '"' || - in_quotes); + (in_quotes || *s == '"' || is_safe_acl_char(*s, true)); s++) { if (*s == '"') { + if (!in_quotes) + { + in_quotes = true; + continue; + } /* safe to look at next char (could be '\0' though) */ if (*(s + 1) != '"') { - in_quotes = !in_quotes; + in_quotes = false; continue; } /* it's an escaped double quote; skip the escaping char */ @@ -207,10 +224,10 @@ putid(char *p, const char *s) const char *src; bool safe = true; + /* Detect whether we need to use double quotes */ for (src = s; *src; src++) { - /* This test had better match what getid() does, above */ - if (!isalnum((unsigned char) *src) && *src != '_') + if (!is_safe_acl_char(*src, false)) { safe = false; break; diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c new file mode 100644 index 0000000000000..6e7b914c56395 --- /dev/null +++ b/src/backend/utils/adt/bytea.c @@ -0,0 +1,1114 @@ +/*------------------------------------------------------------------------- + * + * bytea.c + * Functions for the bytea type. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/bytea.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "catalog/pg_collation_d.h" +#include "catalog/pg_type_d.h" +#include "common/int.h" +#include "fmgr.h" +#include "libpq/pqformat.h" +#include "port/pg_bitutils.h" +#include "utils/builtins.h" +#include "utils/bytea.h" +#include "utils/fmgrprotos.h" +#include "utils/memutils.h" +#include "utils/sortsupport.h" +#include "utils/varlena.h" +#include "varatt.h" + +/* GUC variable */ +int bytea_output = BYTEA_OUTPUT_HEX; + +static bytea *bytea_catenate(bytea *t1, bytea *t2); +static bytea *bytea_substring(Datum str, int S, int L, + bool length_not_specified); +static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); + +/* + * bytea_catenate + * Guts of byteacat(), broken out so it can be used by other functions + * + * Arguments can be in short-header form, but not compressed or out-of-line + */ +static bytea * +bytea_catenate(bytea *t1, bytea *t2) +{ + bytea *result; + int len1, + len2, + len; + char *ptr; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + /* paranoia ... probably should throw error instead? */ + if (len1 < 0) + len1 = 0; + if (len2 < 0) + len2 = 0; + + len = len1 + len2 + VARHDRSZ; + result = (bytea *) palloc(len); + + /* Set size of result string... */ + SET_VARSIZE(result, len); + + /* Fill data field of result string... */ + ptr = VARDATA(result); + if (len1 > 0) + memcpy(ptr, VARDATA_ANY(t1), len1); + if (len2 > 0) + memcpy(ptr + len1, VARDATA_ANY(t2), len2); + + return result; +} + +#define PG_STR_GET_BYTEA(str_) \ + DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) + +static bytea * +bytea_substring(Datum str, + int S, + int L, + bool length_not_specified) +{ + int32 S1; /* adjusted start position */ + int32 L1; /* adjusted substring length */ + int32 E; /* end position */ + + /* + * The logic here should generally match text_substring(). + */ + S1 = Max(S, 1); + + if (length_not_specified) + { + /* + * Not passed a length - DatumGetByteaPSlice() grabs everything to the + * end of the string if we pass it a negative value for length. + */ + L1 = -1; + } + else if (L < 0) + { + /* SQL99 says to throw an error for E < S, i.e., negative length */ + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + L1 = -1; /* silence stupider compilers */ + } + else if (pg_add_s32_overflow(S, L, &E)) + { + /* + * L could be large enough for S + L to overflow, in which case the + * substring must run to end of string. + */ + L1 = -1; + } + else + { + /* + * A zero or negative value for the end position can happen if the + * start was negative or one. SQL99 says to return a zero-length + * string. + */ + if (E < 1) + return PG_STR_GET_BYTEA(""); + + L1 = E - S1; + } + + /* + * If the start position is past the end of the string, SQL99 says to + * return a zero-length string -- DatumGetByteaPSlice() will do that for + * us. We need only convert S1 to zero-based starting position. + */ + return DatumGetByteaPSlice(str, S1 - 1, L1); +} + +static bytea * +bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) +{ + bytea *result; + bytea *s1; + bytea *s2; + int sp_pl_sl; + + /* + * Check for possible integer-overflow cases. For negative sp, throw a + * "substring length" error because that's what should be expected + * according to the spec's definition of OVERLAY(). + */ + if (sp <= 0) + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); + + s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); + s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); + result = bytea_catenate(s1, t2); + result = bytea_catenate(result, s2); + + return result; +} + +/***************************************************************************** + * USER I/O ROUTINES * + *****************************************************************************/ + +#define VAL(CH) ((CH) - '0') +#define DIG(VAL) ((VAL) + '0') + +/* + * byteain - converts from printable representation of byte array + * + * Non-printable characters must be passed as '\nnn' (octal) and are + * converted to internal form. '\' must be passed as '\\'. + */ +Datum +byteain(PG_FUNCTION_ARGS) +{ + char *inputText = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + size_t len = strlen(inputText); + size_t bc; + char *tp; + char *rp; + bytea *result; + + /* Recognize hex input */ + if (inputText[0] == '\\' && inputText[1] == 'x') + { + bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ + result = palloc(bc); + bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), + escontext); + SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ + + PG_RETURN_BYTEA_P(result); + } + + /* Else, it's the traditional escaped style */ + result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */ + + tp = inputText; + rp = VARDATA(result); + while (*tp != '\0') + { + if (tp[0] != '\\') + *rp++ = *tp++; + else if ((tp[1] >= '0' && tp[1] <= '3') && + (tp[2] >= '0' && tp[2] <= '7') && + (tp[3] >= '0' && tp[3] <= '7')) + { + int v; + + v = VAL(tp[1]); + v <<= 3; + v += VAL(tp[2]); + v <<= 3; + *rp++ = v + VAL(tp[3]); + + tp += 4; + } + else if (tp[1] == '\\') + { + *rp++ = '\\'; + tp += 2; + } + else + { + /* + * one backslash, not followed by another or ### valid octal + */ + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "bytea"))); + } + } + + bc = rp - VARDATA(result); /* actual length */ + SET_VARSIZE(result, bc + VARHDRSZ); + + PG_RETURN_BYTEA_P(result); +} + +/* + * byteaout - converts to printable representation of byte array + * + * In the traditional escaped format, non-printable characters are + * printed as '\nnn' (octal) and '\' as '\\'. + */ +Datum +byteaout(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_PP(0); + char *result; + char *rp; + + if (bytea_output == BYTEA_OUTPUT_HEX) + { + /* Print hex format */ + rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); + *rp++ = '\\'; + *rp++ = 'x'; + rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); + } + else if (bytea_output == BYTEA_OUTPUT_ESCAPE) + { + /* Print traditional escaped format */ + char *vp; + uint64 len; + int i; + + len = 1; /* empty string has 1 char */ + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + len += 2; + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + len += 4; + else + len++; + } + + /* + * In principle len can't overflow uint32 if the input fit in 1GB, but + * for safety let's check rather than relying on palloc's internal + * check. + */ + if (len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("result of bytea output conversion is too large"))); + rp = result = (char *) palloc(len); + + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + { + *rp++ = '\\'; + *rp++ = '\\'; + } + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + { + int val; /* holds unprintable chars */ + + val = *vp; + rp[0] = '\\'; + rp[3] = DIG(val & 07); + val >>= 3; + rp[2] = DIG(val & 07); + val >>= 3; + rp[1] = DIG(val & 03); + rp += 4; + } + else + *rp++ = *vp; + } + } + else + { + elog(ERROR, "unrecognized \"bytea_output\" setting: %d", + bytea_output); + rp = result = NULL; /* keep compiler quiet */ + } + *rp = '\0'; + PG_RETURN_CSTRING(result); +} + +/* + * bytearecv - converts external binary format to bytea + */ +Datum +bytearecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + bytea *result; + int nbytes; + + nbytes = buf->len - buf->cursor; + result = (bytea *) palloc(nbytes + VARHDRSZ); + SET_VARSIZE(result, nbytes + VARHDRSZ); + pq_copymsgbytes(buf, VARDATA(result), nbytes); + PG_RETURN_BYTEA_P(result); +} + +/* + * byteasend - converts bytea to binary format + * + * This is a special case: just copy the input... + */ +Datum +byteasend(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); + + PG_RETURN_BYTEA_P(vlena); +} + +Datum +bytea_string_agg_transfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + /* Append the value unless null, preceding it with the delimiter. */ + if (!PG_ARGISNULL(1)) + { + bytea *value = PG_GETARG_BYTEA_PP(1); + bool isfirst = false; + + /* + * You might think we can just throw away the first delimiter, however + * we must keep it as we may be a parallel worker doing partial + * aggregation building a state to send to the main process. We need + * to keep the delimiter of every aggregation so that the combine + * function can properly join up the strings of two separately + * partially aggregated results. The first delimiter is only stripped + * off in the final function. To know how much to strip off the front + * of the string, we store the length of the first delimiter in the + * StringInfo's cursor field, which we don't otherwise need here. + */ + if (state == NULL) + { + MemoryContext aggcontext; + MemoryContext oldcontext; + + if (!AggCheckCallContext(fcinfo, &aggcontext)) + { + /* cannot be called directly because of internal-type argument */ + elog(ERROR, "bytea_string_agg_transfn called in non-aggregate context"); + } + + /* + * Create state in aggregate context. It'll stay there across + * subsequent calls. + */ + oldcontext = MemoryContextSwitchTo(aggcontext); + state = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + isfirst = true; + } + + if (!PG_ARGISNULL(2)) + { + bytea *delim = PG_GETARG_BYTEA_PP(2); + + appendBinaryStringInfo(state, VARDATA_ANY(delim), + VARSIZE_ANY_EXHDR(delim)); + if (isfirst) + state->cursor = VARSIZE_ANY_EXHDR(delim); + } + + appendBinaryStringInfo(state, VARDATA_ANY(value), + VARSIZE_ANY_EXHDR(value)); + } + + /* + * The transition type for string_agg() is declared to be "internal", + * which is a pass-by-value type the same size as a pointer. + */ + if (state) + PG_RETURN_POINTER(state); + PG_RETURN_NULL(); +} + +Datum +bytea_string_agg_finalfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + /* cannot be called directly because of internal-type argument */ + Assert(AggCheckCallContext(fcinfo, NULL)); + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + if (state != NULL) + { + /* As per comment in transfn, strip data before the cursor position */ + bytea *result; + int strippedlen = state->len - state->cursor; + + result = (bytea *) palloc(strippedlen + VARHDRSZ); + SET_VARSIZE(result, strippedlen + VARHDRSZ); + memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); + PG_RETURN_BYTEA_P(result); + } + else + PG_RETURN_NULL(); +} + +/*------------------------------------------------------------- + * byteaoctetlen + * + * get the number of bytes contained in an instance of type 'bytea' + *------------------------------------------------------------- + */ +Datum +byteaoctetlen(PG_FUNCTION_ARGS) +{ + Datum str = PG_GETARG_DATUM(0); + + /* We need not detoast the input at all */ + PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); +} + +/* + * byteacat - + * takes two bytea* and returns a bytea* that is the concatenation of + * the two. + * + * Cloned from textcat and modified as required. + */ +Datum +byteacat(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + + PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); +} + +/* + * byteaoverlay + * Replace specified substring of first string with second + * + * The SQL standard defines OVERLAY() in terms of substring and concatenation. + * This code is a direct implementation of what the standard says. + */ +Datum +byteaoverlay(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl = PG_GETARG_INT32(3); /* substring length */ + + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +Datum +byteaoverlay_no_len(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl; + + sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +/* + * bytea_substr() + * Return a substring starting at the specified position. + * Cloned from text_substr and modified as required. + * + * Input: + * - string + * - starting position (is one-based) + * - string length (optional) + * + * If the starting position is zero or less, then return from the start of the string + * adjusting the length to be consistent with the "negative start" per SQL. + * If the length is less than zero, an ERROR is thrown. If no third argument + * (length) is provided, the length to the end of the string is assumed. + */ +Datum +bytea_substr(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + PG_GETARG_INT32(2), + false)); +} + +/* + * bytea_substr_no_len - + * Wrapper to avoid opr_sanity failure due to + * one function accepting a different number of args. + */ +Datum +bytea_substr_no_len(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + -1, + true)); +} + +/* + * bit_count + */ +Datum +bytea_bit_count(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + + PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); +} + +/* + * byteapos - + * Return the position of the specified substring. + * Implements the SQL POSITION() function. + * Cloned from textpos and modified as required. + */ +Datum +byteapos(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int pos; + int px, + p; + int len1, + len2; + char *p1, + *p2; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + if (len2 <= 0) + PG_RETURN_INT32(1); /* result for empty pattern */ + + p1 = VARDATA_ANY(t1); + p2 = VARDATA_ANY(t2); + + pos = 0; + px = (len1 - len2); + for (p = 0; p <= px; p++) + { + if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) + { + pos = p + 1; + break; + }; + p1++; + }; + + PG_RETURN_INT32(pos); +} + +/*------------------------------------------------------------- + * byteaGetByte + * + * this routine treats "bytea" as an array of bytes. + * It returns the Nth byte (a number between 0 and 255). + *------------------------------------------------------------- + */ +Datum +byteaGetByte(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int32 n = PG_GETARG_INT32(1); + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + byte = ((unsigned char *) VARDATA_ANY(v))[n]; + + PG_RETURN_INT32(byte); +} + +/*------------------------------------------------------------- + * byteaGetBit + * + * This routine treats a "bytea" type like an array of bits. + * It returns the value of the Nth bit (0 or 1). + * + *------------------------------------------------------------- + */ +Datum +byteaGetBit(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int64 n = PG_GETARG_INT64(1); + int byteNo, + bitNo; + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; + + if (byte & (1 << bitNo)) + PG_RETURN_INT32(1); + else + PG_RETURN_INT32(0); +} + +/*------------------------------------------------------------- + * byteaSetByte + * + * Given an instance of type 'bytea' creates a new one with + * the Nth byte set to the given value. + * + *------------------------------------------------------------- + */ +Datum +byteaSetByte(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int32 n = PG_GETARG_INT32(1); + int32 newByte = PG_GETARG_INT32(2); + int len; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + /* + * Now set the byte. + */ + ((unsigned char *) VARDATA(res))[n] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/*------------------------------------------------------------- + * byteaSetBit + * + * Given an instance of type 'bytea' creates a new one with + * the Nth bit set to the given value. + * + *------------------------------------------------------------- + */ +Datum +byteaSetBit(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int64 n = PG_GETARG_INT64(1); + int32 newBit = PG_GETARG_INT32(2); + int len; + int oldByte, + newByte; + int byteNo, + bitNo; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + /* + * sanity check! + */ + if (newBit != 0 && newBit != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new bit must be 0 or 1"))); + + /* + * Update the byte. + */ + oldByte = ((unsigned char *) VARDATA(res))[byteNo]; + + if (newBit == 0) + newByte = oldByte & (~(1 << bitNo)); + else + newByte = oldByte | (1 << bitNo); + + ((unsigned char *) VARDATA(res))[byteNo] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/* + * Return reversed bytea + */ +Datum +bytea_reverse(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + const char *p = VARDATA_ANY(v); + int len = VARSIZE_ANY_EXHDR(v); + const char *endp = p + len; + bytea *result = palloc(len + VARHDRSZ); + char *dst = (char *) VARDATA(result) + len; + + SET_VARSIZE(result, len + VARHDRSZ); + + while (p < endp) + *(--dst) = *p++; + + PG_RETURN_BYTEA_P(result); +} + + +/***************************************************************************** + * Comparison Functions used for bytea + * + * Note: btree indexes need these routines not to leak memory; therefore, + * be careful to free working copies of toasted datums. Most places don't + * need to be so careful. + *****************************************************************************/ + +Datum +byteaeq(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. + */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = false; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) == 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +byteane(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. + */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = true; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) != 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +bytealt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); +} + +Datum +byteale(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); +} + +Datum +byteagt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); +} + +Datum +byteage(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); +} + +Datum +byteacmp(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + if ((cmp == 0) && (len1 != len2)) + cmp = (len1 < len2) ? -1 : 1; + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_INT32(cmp); +} + +Datum +bytea_larger(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +Datum +bytea_smaller(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +Datum +bytea_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport, forcing "C" collation */ + varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + +/* Cast bytea -> int2 */ +Datum +bytea_int2(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint16 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("smallint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT16(result); +} + +/* Cast bytea -> int4 */ +Datum +bytea_int4(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint32 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT32(result); +} + +/* Cast bytea -> int8 */ +Datum +bytea_int8(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint64 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT64(result); +} + +/* Cast int2 -> bytea; can just use int2send() */ +Datum +int2_bytea(PG_FUNCTION_ARGS) +{ + return int2send(fcinfo); +} + +/* Cast int4 -> bytea; can just use int4send() */ +Datum +int4_bytea(PG_FUNCTION_ARGS) +{ + return int4send(fcinfo); +} + +/* Cast int8 -> bytea; can just use int8send() */ +Datum +int8_bytea(PG_FUNCTION_ARGS) +{ + return int8send(fcinfo); +} diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 4227ab1a72bfb..344f58b92f7a2 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -1363,10 +1363,35 @@ timestamp_date(PG_FUNCTION_ARGS) { Timestamp timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamp2date_opt_overflow(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamp to date. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamp is finite but out of the valid range for date, then: + * if overflow is NULL, we throw an out-of-range error. + * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate date infinity. + * + * Note: given the ranges of the types, overflow is only possible at + * the minimum end of the range, but we don't assume that in this code. + */ +DateADT +timestamp2date_opt_overflow(Timestamp timestamp, int *overflow) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; + if (overflow) + *overflow = 0; + if (TIMESTAMP_IS_NOBEGIN(timestamp)) DATE_NOBEGIN(result); else if (TIMESTAMP_IS_NOEND(timestamp)) @@ -1374,14 +1399,30 @@ timestamp_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + DATE_NOBEGIN(result); + } + else + { + *overflow = 1; /* not actually reachable */ + DATE_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } @@ -1408,11 +1449,36 @@ timestamptz_date(PG_FUNCTION_ARGS) { TimestampTz timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamptz2date_opt_overflow(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamptz to date. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamptz is finite but out of the valid range for date, then: + * if overflow is NULL, we throw an out-of-range error. + * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate date infinity. + * + * Note: given the ranges of the types, overflow is only possible at + * the minimum end of the range, but we don't assume that in this code. + */ +DateADT +timestamptz2date_opt_overflow(TimestampTz timestamp, int *overflow) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; int tz; + if (overflow) + *overflow = 0; + if (TIMESTAMP_IS_NOBEGIN(timestamp)) DATE_NOBEGIN(result); else if (TIMESTAMP_IS_NOEND(timestamp)) @@ -1420,14 +1486,30 @@ timestamptz_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + DATE_NOBEGIN(result); + } + else + { + *overflow = 1; /* not actually reachable */ + DATE_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index ba66a9c4ce63a..7b97d2be6caed 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -4067,8 +4067,9 @@ float84ge(PG_FUNCTION_ARGS) * with the specified characteristics. An operand smaller than the * lower bound is assigned to bucket 0. An operand greater than or equal * to the upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the float8 inputs, and we - * don't allow either of the histogram bounds to be +/- infinity. + * count+1). We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). */ Datum width_bucket_float8(PG_FUNCTION_ARGS) @@ -4084,12 +4085,11 @@ width_bucket_float8(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (isnan(operand) || isnan(bound1) || isnan(bound2)) + if (isnan(bound1) || isnan(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); + errmsg("lower and upper bounds cannot be NaN"))); - /* Note that we allow "operand" to be infinite */ if (isinf(bound1) || isinf(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), @@ -4097,15 +4097,15 @@ width_bucket_float8(PG_FUNCTION_ARGS) if (bound1 < bound2) { - if (operand < bound1) - result = 0; - else if (operand >= bound2) + if (isnan(operand) || operand >= bound2) { if (pg_add_s32_overflow(count, 1, &result)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("integer out of range"))); } + else if (operand < bound1) + result = 0; else { if (!isinf(bound2 - bound1)) @@ -4135,7 +4135,7 @@ width_bucket_float8(PG_FUNCTION_ARGS) } else if (bound1 > bound2) { - if (operand > bound1) + if (isnan(operand) || operand > bound1) result = 0; else if (operand <= bound2) { diff --git a/src/backend/utils/adt/inet_net_pton.c b/src/backend/utils/adt/inet_net_pton.c index ef2236d9f0430..3b0db2a379937 100644 --- a/src/backend/utils/adt/inet_net_pton.c +++ b/src/backend/utils/adt/inet_net_pton.c @@ -115,8 +115,7 @@ inet_cidr_pton_ipv4(const char *src, u_char *dst, size_t size) src++; /* skip x or X. */ while ((ch = *src++) != '\0' && isxdigit((unsigned char) ch)) { - if (isupper((unsigned char) ch)) - ch = tolower((unsigned char) ch); + ch = pg_ascii_tolower((unsigned char) ch); n = strchr(xdigits, ch) - xdigits; assert(n >= 0 && n <= 15); if (dirty == 0) diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e05975..82b807d067a34 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -277,22 +277,16 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * - * If the two values were of the same container type, then there'd - * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're - * either two heterogeneously-typed containers, or a container and - * some scalar type. - * - * We don't have to consider the WJB_END_ARRAY and WJB_END_OBJECT - * cases here, because we would have seen the corresponding - * WJB_BEGIN_ARRAY and WJB_BEGIN_OBJECT tokens first, and - * concluded that they don't match. + * It's not possible for one iterator to report end of array or + * object while the other one reports something else, because we + * would have detected a length mismatch when we processed the + * container-start tokens above. Likewise we can't see WJB_DONE + * from one but not the other. So we have two different-type + * containers, or a container and some scalar type, or two + * different scalar types. Sort on the basis of the type code. */ - Assert(ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); - Assert(rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); + Assert(ra != WJB_DONE && ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); + Assert(rb != WJB_DONE && rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); Assert(va.type != vb.type); Assert(va.type != jbvBinary); @@ -852,15 +846,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +897,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +951,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +996,10 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7f4cf6145854a..4216ac17f4371 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -98,7 +98,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale) else if (locale->is_default) return pg_tolower(c); else - return tolower_l(c, locale->info.lt); + return char_tolower(c, locale); } @@ -209,7 +209,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) * way. */ - if (pg_database_encoding_max_length() > 1 || (locale->provider == COLLPROVIDER_ICU)) + if (locale->ctype_is_c || + (char_tolower_enabled(locale) && + pg_database_encoding_max_length() == 1)) + { + p = VARDATA_ANY(pat); + plen = VARSIZE_ANY_EXHDR(pat); + s = VARDATA_ANY(str); + slen = VARSIZE_ANY_EXHDR(str); + return SB_IMatchText(s, slen, p, plen, locale); + } + else { pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, PointerGetDatum(pat))); @@ -224,14 +234,6 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) else return MB_MatchText(s, slen, p, plen, 0); } - else - { - p = VARDATA_ANY(pat); - plen = VARSIZE_ANY_EXHDR(pat); - s = VARDATA_ANY(str); - slen = VARSIZE_ANY_EXHDR(str); - return SB_IMatchText(s, slen, p, plen, locale); - } } /* diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f4d..999f23f86d51d 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -1495,13 +1495,8 @@ pattern_char_isalpha(char c, bool is_multibyte, { if (locale->ctype_is_c) return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale->provider != COLLPROVIDER_LIBC) - return IS_HIGHBIT_SET(c) || - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); else - return isalpha_l((unsigned char) c, locale->info.lt); + return char_is_cased(c, locale); } diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 244f48f4fd711..ed9bbd7b9266b 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -12,6 +12,7 @@ backend_sources += files( 'arrayutils.c', 'ascii.c', 'bool.c', + 'bytea.c', 'cash.c', 'char.c', 'cryptohashfuncs.c', diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index f03fcc1147bb0..9fd211b2d4576 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -12,8 +12,6 @@ #include #include -#include "access/stratnum.h" -#include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/hashfn.h" #include "common/ip.h" diff --git a/src/backend/utils/adt/network_spgist.c b/src/backend/utils/adt/network_spgist.c index a84747d927586..602276a35c3ea 100644 --- a/src/backend/utils/adt/network_spgist.c +++ b/src/backend/utils/adt/network_spgist.c @@ -37,7 +37,6 @@ #include "catalog/pg_type.h" #include "utils/fmgrprotos.h" #include "utils/inet.h" -#include "varatt.h" static int inet_spg_node_number(const inet *val, int commonbits); diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 58ad1a65ef7b1..c9233565d57a7 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -1960,8 +1960,9 @@ generate_series_numeric_support(PG_FUNCTION_ARGS) * with the specified characteristics. An operand smaller than the * lower bound is assigned to bucket 0. An operand greater than or equal * to the upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the numeric inputs, and we - * don't allow either of the histogram bounds to be +/- infinity. + * count+1). We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). */ Datum width_bucket_numeric(PG_FUNCTION_ARGS) @@ -1979,17 +1980,13 @@ width_bucket_numeric(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (NUMERIC_IS_SPECIAL(operand) || - NUMERIC_IS_SPECIAL(bound1) || - NUMERIC_IS_SPECIAL(bound2)) + if (NUMERIC_IS_SPECIAL(bound1) || NUMERIC_IS_SPECIAL(bound2)) { - if (NUMERIC_IS_NAN(operand) || - NUMERIC_IS_NAN(bound1) || - NUMERIC_IS_NAN(bound2)) + if (NUMERIC_IS_NAN(bound1) || NUMERIC_IS_NAN(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); - /* We allow "operand" to be infinite; cmp_numerics will cope */ + errmsg("lower and upper bounds cannot be NaN"))); + if (NUMERIC_IS_INF(bound1) || NUMERIC_IS_INF(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index f5e31c433a0de..97c2ac1faf9a4 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -41,7 +41,6 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/formatting.h" #include "utils/guc_hooks.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -80,31 +79,6 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); extern char *get_collation_actual_version_libc(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -1093,6 +1067,9 @@ create_pg_locale(Oid collid, MemoryContext context) Assert((result->collate_is_c && result->collate == NULL) || (!result->collate_is_c && result->collate != NULL)); + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1257,77 +1234,31 @@ size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strlower_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strlower_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } size_t pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strtitle_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strtitle_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strtitle_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strtitle(dst, dstsize, src, srclen, locale); } size_t pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strupper_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strupper_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strupper_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strupper(dst, dstsize, src, srclen, locale); } size_t pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strfold_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strfold_icu(dst, dstsize, src, srclen, locale); -#endif - /* for libc, just use strlower */ - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype->strfold) + return locale->ctype->strfold(dst, dstsize, src, srclen, locale); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } /* @@ -1464,6 +1395,41 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } +/* + * char_is_cased() + * + * Fuzzy test of whether the given char is case-varying or not. The argument + * is a single byte, so in a multibyte encoding, just assume any non-ASCII + * char is case-varying. + */ +bool +char_is_cased(char ch, pg_locale_t locale) +{ + return locale->ctype->char_is_cased(ch, locale); +} + +/* + * char_tolower_enabled() + * + * Does the provider support char_tolower()? + */ +bool +char_tolower_enabled(pg_locale_t locale) +{ + return (locale->ctype->char_tolower != NULL); +} + +/* + * char_tolower() + * + * Convert char (single-byte encoding) to lowercase. + */ +char +char_tolower(unsigned char ch, pg_locale_t locale) +{ + return locale->ctype->char_tolower(ch, locale); +} + /* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index f51768830cd7b..0c9fbdb40f2a9 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -18,22 +18,12 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/memutils.h" #include "utils/pg_locale.h" #include "utils/syscache.h" extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); -extern size_t strlower_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); - struct WordBoundaryState { @@ -77,7 +67,7 @@ initcap_wbnext(void *state) return wbstate->len; } -size_t +static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -85,7 +75,7 @@ strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -103,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, initcap_wbnext, &wbstate); } -size_t +static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -111,7 +101,7 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -119,6 +109,98 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +static bool +wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalpha(wc); +} + +static bool +wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isupper(wc); +} + +static bool +wc_islower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_islower(wc); +} + +static bool +wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isgraph(wc); +} + +static bool +wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isprint(wc); +} + +static bool +wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isspace(wc); +} + +static bool +char_is_cased_builtin(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_uppercase_simple(wc); +} + +static pg_wchar +wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_lowercase_simple(wc); +} + +static const struct ctype_methods ctype_methods_builtin = { + .strlower = strlower_builtin, + .strtitle = strtitle_builtin, + .strupper = strupper_builtin, + .strfold = strfold_builtin, + .wc_isdigit = wc_isdigit_builtin, + .wc_isalpha = wc_isalpha_builtin, + .wc_isalnum = wc_isalnum_builtin, + .wc_isupper = wc_isupper_builtin, + .wc_islower = wc_islower_builtin, + .wc_isgraph = wc_isgraph_builtin, + .wc_isprint = wc_isprint_builtin, + .wc_ispunct = wc_ispunct_builtin, + .wc_isspace = wc_isspace_builtin, + .char_is_cased = char_is_cased_builtin, + .wc_tolower = wc_tolower_builtin, + .wc_toupper = wc_toupper_builtin, +}; + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { @@ -158,10 +240,11 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->info.builtin.locale = MemoryContextStrdup(context, locstr); result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); - result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &ctype_methods_builtin; return result; } diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index a32c32a0744bd..96741e08269a4 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,19 +48,22 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); -extern size_t strlower_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static size_t strlower_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strfold_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); static size_t strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -118,6 +121,25 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +static bool +char_is_cased_icu(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar +tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, .strnxfrm = strnxfrm_icu, @@ -136,6 +158,78 @@ static const struct collate_methods collate_methods_icu_utf8 = { .strxfrm_is_safe = true, }; +static bool +wc_isdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isdigit(wc); +} + +static bool +wc_isalpha_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalpha(wc); +} + +static bool +wc_isalnum_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalnum(wc); +} + +static bool +wc_isupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isupper(wc); +} + +static bool +wc_islower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_islower(wc); +} + +static bool +wc_isgraph_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isgraph(wc); +} + +static bool +wc_isprint_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isprint(wc); +} + +static bool +wc_ispunct_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_ispunct(wc); +} + +static bool +wc_isspace_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isspace(wc); +} + +static const struct ctype_methods ctype_methods_icu = { + .strlower = strlower_icu, + .strtitle = strtitle_icu, + .strupper = strupper_icu, + .strfold = strfold_icu, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .char_is_cased = char_is_cased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; #endif pg_locale_t @@ -198,7 +292,6 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.icu.locale = MemoryContextStrdup(context, iculocstr); result->info.icu.ucol = collator; - result->provider = COLLPROVIDER_ICU; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -206,6 +299,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate = &collate_methods_icu_utf8; else result->collate = &collate_methods_icu; + result->ctype = &ctype_methods_icu; return result; #else @@ -379,7 +473,7 @@ make_icu_collator(const char *iculocstr, const char *icurules) } } -size_t +static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -399,7 +493,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -419,7 +513,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -439,7 +533,7 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -474,8 +568,6 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 int result; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; @@ -503,8 +595,6 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -549,8 +639,6 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); @@ -749,8 +837,6 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -799,8 +885,6 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 199857e22dbec..8d88b53c37529 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -33,6 +33,46 @@ #include #endif +/* + * For the libc provider, to provide as much functionality as possible on a + * variety of platforms without going so far as to implement everything from + * scratch, we use several implementation strategies depending on the + * situation: + * + * 1. In C/POSIX collations, we use hard-wired code. We can't depend on + * the functions since those will obey LC_CTYPE. Note that these + * collations don't give a fig about multibyte characters. + * + * 2. When working in UTF8 encoding, we use the functions. + * This assumes that every platform uses Unicode codepoints directly + * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption + * even for non-UTF8 encodings, which may be a problem.) On some platforms + * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. + * + * 3. In all other encodings, we use the functions for pg_wchar + * values up to 255, and punt for values above that. This is 100% correct + * only in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't very relevant for higher code values + * anyway. The difficulty with using the functions with + * non-Unicode multibyte encodings is that we can have no certainty that + * the platform's wchar_t representation matches what we do in pg_wchar + * conversions. + * + * As a special case, in the "default" collation, (2) and (3) force ASCII + * letters to follow ASCII upcase/downcase rules, while in a non-default + * collation we just let the library functions do what they will. The case + * where this matters is treatment of I/i in Turkish, and the behavior is + * meant to match the upper()/lower() SQL functions. + * + * We store the active collation setting in static variables. In principle + * it could be passed down to here via the regex library's "struct vars" data + * structure; but that would require somewhat invasive changes in the regex + * library, and right now there's no real benefit to be gained from that. + * + * NB: the coding here assumes pg_wchar is an unsigned type. + */ + /* * Size of stack buffer to use for string transformations, used to avoid heap * allocations in typical cases. This should be large enough that most strings @@ -43,13 +83,6 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -85,6 +118,251 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static bool +wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isdigit_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalpha_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalnum_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return islower_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isgraph_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isprint_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return ispunct_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isspace_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswdigit_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalpha_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalnum_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswlower_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswgraph_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswprint_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswpunct_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswspace_l((wint_t) wc, locale->info.lt); +} + +static char +char_tolower_libc(unsigned char ch, pg_locale_t locale) +{ + Assert(pg_database_encoding_max_length() == 1); + return tolower_l(ch, locale->info.lt); +} + +static bool +char_is_cased_libc(char ch, pg_locale_t locale) +{ + bool is_multibyte = pg_database_encoding_max_length() > 1; + + if (is_multibyte && IS_HIGHBIT_SET(ch)) + return true; + else + return isalpha_l((unsigned char) ch, locale->info.lt); +} + +static pg_wchar +toupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +toupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static const struct ctype_methods ctype_methods_libc_sb = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +/* + * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but + * single-byte semantics for pattern matching. + */ +static const struct ctype_methods ctype_methods_libc_other_mb = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +static const struct ctype_methods ctype_methods_libc_utf8 = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_mb, + .wc_tolower = tolower_libc_mb, +}; + static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, .strnxfrm = strnxfrm_libc, @@ -119,36 +397,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { }; #endif -size_t -strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strlower_libc_mb(dst, dstsize, src, srclen, locale); - else - return strlower_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strtitle_libc_mb(dst, dstsize, src, srclen, locale); - else - return strtitle_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strupper_libc_mb(dst, dstsize, src, srclen, locale); - else - return strupper_libc_sb(dst, dstsize, src, srclen, locale); -} - static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -209,7 +457,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towlower_l(workspace[curr_char], loc); @@ -220,7 +468,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -304,7 +552,7 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -321,7 +569,7 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -392,7 +640,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towupper_l(workspace[curr_char], loc); @@ -403,7 +651,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -465,7 +713,6 @@ create_pg_locale_libc(Oid collid, MemoryContext context) loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->provider = COLLPROVIDER_LIBC; result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); @@ -481,6 +728,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) #endif result->collate = &collate_methods_libc; } + if (!result->ctype_is_c) + { + if (GetDatabaseEncoding() == PG_UTF8) + result->ctype = &ctype_methods_libc_utf8; + else if (pg_database_encoding_max_length() > 1) + result->ctype = &ctype_methods_libc_other_mb; + else + result->ctype = &ctype_methods_libc_sb; + } return result; } @@ -576,8 +832,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -632,8 +886,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (srclen == -1) return strxfrm_l(dest, src, destsize, locale->info.lt); @@ -742,7 +994,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) @@ -879,7 +1130,7 @@ wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc) * zero-terminated. The output will be zero-terminated iff there is room. */ size_t -wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) +wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc) { size_t result; @@ -909,7 +1160,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) } else #endif /* WIN32 */ - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use wcstombs directly for the default locale */ result = wcstombs(to, from, tolen); @@ -917,7 +1168,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) else { /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, loc); } return result; @@ -934,7 +1185,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) */ size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, - pg_locale_t locale) + locale_t loc) { size_t result; @@ -969,7 +1220,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, /* mbstowcs requires ending '\0' */ char *str = pnstrdup(from, fromlen); - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use mbstowcs directly for the default locale */ result = mbstowcs(to, str, tolen); @@ -977,7 +1228,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, else { /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale->info.lt); + result = mbstowcs_l(to, str, tolen, loc); } pfree(str); diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 16311590a14a0..12de2446f5b69 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -83,7 +83,7 @@ pg_lsn_out(PG_FUNCTION_ARGS) char buf[MAXPG_LSNLEN + 1]; char *result; - snprintf(buf, sizeof buf, "%X/%X", LSN_FORMAT_ARGS(lsn)); + snprintf(buf, sizeof buf, "%X/%08X", LSN_FORMAT_ARGS(lsn)); result = pstrdup(buf); PG_RETURN_CSTRING(result); } diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index d44f8c262baa2..a4f8b4faa90dc 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -21,6 +21,7 @@ #include "commands/extension.h" #include "miscadmin.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" @@ -410,3 +411,21 @@ binary_upgrade_replorigin_advance(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * binary_upgrade_create_conflict_detection_slot + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins. + */ +Datum +binary_upgrade_create_conflict_detection_slot(PG_FUNCTION_ARGS) +{ + CHECK_IS_BINARY_UPGRADE; + + CreateConflictDetectionSlot(); + + ReplicationSlotRelease(); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 5ee608a2b3921..b8bbe95e82eb8 100644 --- a/src/backend/utils/adt/regproc.c +++ b/src/backend/utils/adt/regproc.c @@ -30,6 +30,7 @@ #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" #include "catalog/pg_type.h" +#include "commands/dbcommands.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" @@ -1763,6 +1764,123 @@ regnamespacesend(PG_FUNCTION_ARGS) return oidsend(fcinfo); } +/* + * regdatabasein - converts database name to database OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_database entry. + */ +Datum +regdatabasein(PG_FUNCTION_ARGS) +{ + char *db_name_or_oid = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + Oid result; + List *names; + + /* Handle "-" or numeric OID */ + if (parseDashOrOid(db_name_or_oid, &result, escontext)) + PG_RETURN_OID(result); + + /* The rest of this wouldn't work in bootstrap mode */ + if (IsBootstrapProcessingMode()) + elog(ERROR, "regdatabase values must be OIDs in bootstrap mode"); + + /* Normal case: see if the name matches any pg_database entry. */ + names = stringToQualifiedNameList(db_name_or_oid, escontext); + if (names == NIL) + PG_RETURN_NULL(); + + if (list_length(names) != 1) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); + + result = get_database_oid(strVal(linitial(names)), true); + + if (!OidIsValid(result)) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("database \"%s\" does not exist", + strVal(linitial(names))))); + + PG_RETURN_OID(result); +} + +/* + * to_regdatabase - converts database name to database OID + * + * If the name is not found, we return NULL. + */ +Datum +to_regdatabase(PG_FUNCTION_ARGS) +{ + char *db_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Datum result; + ErrorSaveContext escontext = {T_ErrorSaveContext}; + + if (!DirectInputFunctionCallSafe(regdatabasein, db_name, + InvalidOid, -1, + (Node *) &escontext, + &result)) + PG_RETURN_NULL(); + PG_RETURN_DATUM(result); +} + +/* + * regdatabaseout - converts database OID to database name + */ +Datum +regdatabaseout(PG_FUNCTION_ARGS) +{ + Oid dboid = PG_GETARG_OID(0); + char *result; + + if (dboid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + result = get_database_name(dboid); + + if (result) + { + /* pstrdup is not really necessary, but it avoids a compiler warning */ + result = pstrdup(quote_identifier(result)); + } + else + { + /* If OID doesn't match any database, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", dboid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regdatabaserecv - converts external binary format to regdatabase + */ +Datum +regdatabaserecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regdatabasesend - converts regdatabase to binary format + */ +Datum +regdatabasesend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + /* * text_regclass: convert text to regclass * diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 6239900fa2892..059fc5ebf601a 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -30,7 +30,6 @@ #include "access/xact.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" -#include "catalog/pg_proc.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/spi.h" @@ -46,7 +45,6 @@ #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" -#include "utils/rangetypes.h" #include "utils/rel.h" #include "utils/rls.h" #include "utils/ruleutils.h" diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc69..17fbfa9b41063 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -103,7 +103,6 @@ #include "access/table.h" #include "access/tableam.h" #include "access/visibilitymap.h" -#include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" @@ -3799,18 +3798,25 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, List *hashclauses, Selectivity *innerbucketsize) { - List *clauses = list_copy(hashclauses); - List *otherclauses = NIL; - double ndistinct = 1.0; + List *clauses; + List *otherclauses; + double ndistinct; if (list_length(hashclauses) <= 1) - + { /* * Nothing to do for a single clause. Could we employ univariate * extended stat here? */ return hashclauses; + } + /* "clauses" is the list of hashclauses we've not dealt with yet */ + clauses = list_copy(hashclauses); + /* "otherclauses" holds clauses we are going to return to caller */ + otherclauses = NIL; + /* current estimate of ndistinct */ + ndistinct = 1.0; while (clauses != NIL) { ListCell *lc; @@ -3875,12 +3881,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, group_rel = root->simple_rel_array[relid]; } else if (group_relid != relid) - + { /* * Being in the group forming state we don't need other * clauses. */ continue; + } /* * We're going to add the new clause to the varinfos list. We @@ -4620,6 +4627,7 @@ convert_to_scalar(Datum value, Oid valuetypid, Oid collid, double *scaledvalue, case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *scaledvalue = convert_numeric_to_scalar(value, valuetypid, &failure); *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid, @@ -4752,6 +4760,7 @@ convert_numeric_to_scalar(Datum value, Oid typid, bool *failure) case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: /* we can treat OIDs as integers... */ return (double) DatumGetObjectId(value); } diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 1b0df1117171a..39dab3e42df58 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -84,7 +84,7 @@ tidin(PG_FUNCTION_ARGS) /* * Cope with possibility that unsigned long is wider than BlockNumber, in * which case strtoul will not raise an error for some values that are out - * of the range of BlockNumber. (See similar code in oidin().) + * of the range of BlockNumber. (See similar code in uint32in_subr().) */ #if SIZEOF_LONG > 4 if (cvt != (unsigned long) blockNumber && diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 347089b762646..25cff56c3d07e 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -5312,10 +5312,10 @@ isoweekdate2date(int isoweek, int wday, int *year, int *mon, int *mday) int date2isoweek(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5338,13 +5338,13 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. */ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -5352,10 +5352,10 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); if (dayn >= day4 - day0) - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; } - return (int) result; + return week; } @@ -5367,10 +5367,10 @@ date2isoweek(int year, int mon, int mday) int date2isoyear(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5395,13 +5395,13 @@ date2isoyear(int year, int mon, int mday) year--; } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. */ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -6477,7 +6477,7 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) if (TIMESTAMP_NOT_FINITE(timestamp)) return timestamp; - /* We don't expect this to fail, but check it pro forma */ + /* timestamp2tm should not fail on valid timestamps, but cope */ if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) == 0) { tz = DetermineTimeZoneOffset(tm, session_timezone); @@ -6485,23 +6485,22 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) result = dt2local(timestamp, -tz); if (IS_VALID_TIMESTAMP(result)) - { return result; + } + + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); } - else if (overflow) + else { - if (result < MIN_TIMESTAMP) - { - *overflow = -1; - TIMESTAMP_NOBEGIN(result); - } - else - { - *overflow = 1; - TIMESTAMP_NOEND(result); - } - return result; + *overflow = 1; + TIMESTAMP_NOEND(result); } + return result; } ereport(ERROR, @@ -6531,8 +6530,27 @@ timestamptz_timestamp(PG_FUNCTION_ARGS) PG_RETURN_TIMESTAMP(timestamptz2timestamp(timestamp)); } +/* + * Convert timestamptz to timestamp, throwing error for overflow. + */ static Timestamp timestamptz2timestamp(TimestampTz timestamp) +{ + return timestamptz2timestamp_opt_overflow(timestamp, NULL); +} + +/* + * Convert timestamp with time zone to timestamp. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamptz is finite but out of the valid range for timestamp, then: + * if overflow is NULL, we throw an out-of-range error. + * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate timestamp infinity. + */ +Timestamp +timestamptz2timestamp_opt_overflow(TimestampTz timestamp, int *overflow) { Timestamp result; struct pg_tm tt, @@ -6540,18 +6558,53 @@ timestamptz2timestamp(TimestampTz timestamp) fsec_t fsec; int tz; + if (overflow) + *overflow = 0; + if (TIMESTAMP_NOT_FINITE(timestamp)) result = timestamp; else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); + } + else + { + *overflow = 1; + TIMESTAMP_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } if (tm2timestamp(tm, fsec, NULL, &result) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); + } + else + { + *overflow = 1; + TIMESTAMP_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } } return result; } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3e4d5568bde89..ffae8c23abfaf 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -35,7 +35,6 @@ #include "port/pg_bswap.h" #include "regex/regex.h" #include "utils/builtins.h" -#include "utils/bytea.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -43,10 +42,6 @@ #include "utils/sortsupport.h" #include "utils/varlena.h" - -/* GUC variable */ -int bytea_output = BYTEA_OUTPUT_HEX; - typedef struct varlena VarString; /* @@ -148,12 +143,6 @@ static int text_position_get_match_pos(TextPositionState *state); static void text_position_cleanup(TextPositionState *state); static void check_collation_set(Oid collid); static int text_cmp(text *arg1, text *arg2, Oid collid); -static bytea *bytea_catenate(bytea *t1, bytea *t2); -static bytea *bytea_substring(Datum str, - int S, - int L, - bool length_not_specified); -static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); static void appendStringInfoText(StringInfo str, const text *t); static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); static void split_text_accum_result(SplitTextOutputData *tstate, @@ -279,307 +268,6 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) * USER I/O ROUTINES * *****************************************************************************/ - -#define VAL(CH) ((CH) - '0') -#define DIG(VAL) ((VAL) + '0') - -/* - * byteain - converts from printable representation of byte array - * - * Non-printable characters must be passed as '\nnn' (octal) and are - * converted to internal form. '\' must be passed as '\\'. - * ereport(ERROR, ...) if bad form. - * - * BUGS: - * The input is scanned twice. - * The error checking of input is minimal. - */ -Datum -byteain(PG_FUNCTION_ARGS) -{ - char *inputText = PG_GETARG_CSTRING(0); - Node *escontext = fcinfo->context; - char *tp; - char *rp; - int bc; - bytea *result; - - /* Recognize hex input */ - if (inputText[0] == '\\' && inputText[1] == 'x') - { - size_t len = strlen(inputText); - - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ - result = palloc(bc); - bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), - escontext); - SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ - - PG_RETURN_BYTEA_P(result); - } - - /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); - - tp = inputText; - rp = VARDATA(result); - while (*tp != '\0') - { - if (tp[0] != '\\') - *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); - - tp += 4; - } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - { - *rp++ = '\\'; - tp += 2; - } - else - { - /* - * We should never get here. The first pass should not allow it. - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - PG_RETURN_BYTEA_P(result); -} - -/* - * byteaout - converts to printable representation of byte array - * - * In the traditional escaped format, non-printable characters are - * printed as '\nnn' (octal) and '\' as '\\'. - */ -Datum -byteaout(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_PP(0); - char *result; - char *rp; - - if (bytea_output == BYTEA_OUTPUT_HEX) - { - /* Print hex format */ - rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); - *rp++ = '\\'; - *rp++ = 'x'; - rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); - } - else if (bytea_output == BYTEA_OUTPUT_ESCAPE) - { - /* Print traditional escaped format */ - char *vp; - uint64 len; - int i; - - len = 1; /* empty string has 1 char */ - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - len += 2; - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - len += 4; - else - len++; - } - - /* - * In principle len can't overflow uint32 if the input fit in 1GB, but - * for safety let's check rather than relying on palloc's internal - * check. - */ - if (len > MaxAllocSize) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_internal("result of bytea output conversion is too large"))); - rp = result = (char *) palloc(len); - - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - { - *rp++ = '\\'; - *rp++ = '\\'; - } - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - { - int val; /* holds unprintable chars */ - - val = *vp; - rp[0] = '\\'; - rp[3] = DIG(val & 07); - val >>= 3; - rp[2] = DIG(val & 07); - val >>= 3; - rp[1] = DIG(val & 03); - rp += 4; - } - else - *rp++ = *vp; - } - } - else - { - elog(ERROR, "unrecognized \"bytea_output\" setting: %d", - bytea_output); - rp = result = NULL; /* keep compiler quiet */ - } - *rp = '\0'; - PG_RETURN_CSTRING(result); -} - -/* - * bytearecv - converts external binary format to bytea - */ -Datum -bytearecv(PG_FUNCTION_ARGS) -{ - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - bytea *result; - int nbytes; - - nbytes = buf->len - buf->cursor; - result = (bytea *) palloc(nbytes + VARHDRSZ); - SET_VARSIZE(result, nbytes + VARHDRSZ); - pq_copymsgbytes(buf, VARDATA(result), nbytes); - PG_RETURN_BYTEA_P(result); -} - -/* - * byteasend - converts bytea to binary format - * - * This is a special case: just copy the input... - */ -Datum -byteasend(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); - - PG_RETURN_BYTEA_P(vlena); -} - -Datum -bytea_string_agg_transfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); - - /* Append the value unless null, preceding it with the delimiter. */ - if (!PG_ARGISNULL(1)) - { - bytea *value = PG_GETARG_BYTEA_PP(1); - bool isfirst = false; - - /* - * You might think we can just throw away the first delimiter, however - * we must keep it as we may be a parallel worker doing partial - * aggregation building a state to send to the main process. We need - * to keep the delimiter of every aggregation so that the combine - * function can properly join up the strings of two separately - * partially aggregated results. The first delimiter is only stripped - * off in the final function. To know how much to strip off the front - * of the string, we store the length of the first delimiter in the - * StringInfo's cursor field, which we don't otherwise need here. - */ - if (state == NULL) - { - state = makeStringAggState(fcinfo); - isfirst = true; - } - - if (!PG_ARGISNULL(2)) - { - bytea *delim = PG_GETARG_BYTEA_PP(2); - - appendBinaryStringInfo(state, VARDATA_ANY(delim), - VARSIZE_ANY_EXHDR(delim)); - if (isfirst) - state->cursor = VARSIZE_ANY_EXHDR(delim); - } - - appendBinaryStringInfo(state, VARDATA_ANY(value), - VARSIZE_ANY_EXHDR(value)); - } - - /* - * The transition type for string_agg() is declared to be "internal", - * which is a pass-by-value type the same size as a pointer. - */ - if (state) - PG_RETURN_POINTER(state); - PG_RETURN_NULL(); -} - -Datum -bytea_string_agg_finalfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - /* cannot be called directly because of internal-type argument */ - Assert(AggCheckCallContext(fcinfo, NULL)); - - state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); - - if (state != NULL) - { - /* As per comment in transfn, strip data before the cursor position */ - bytea *result; - int strippedlen = state->len - state->cursor; - - result = (bytea *) palloc(strippedlen + VARHDRSZ); - SET_VARSIZE(result, strippedlen + VARHDRSZ); - memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); - PG_RETURN_BYTEA_P(result); - } - else - PG_RETURN_NULL(); -} - /* * textin - converts cstring to internal representation */ @@ -2959,552 +2647,91 @@ bttext_pattern_sortsupport(PG_FUNCTION_ARGS) } -/*------------------------------------------------------------- - * byteaoctetlen - * - * get the number of bytes contained in an instance of type 'bytea' - *------------------------------------------------------------- +/* text_name() + * Converts a text type to a Name type. */ Datum -byteaoctetlen(PG_FUNCTION_ARGS) +text_name(PG_FUNCTION_ARGS) { - Datum str = PG_GETARG_DATUM(0); + text *s = PG_GETARG_TEXT_PP(0); + Name result; + int len; - /* We need not detoast the input at all */ - PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); + len = VARSIZE_ANY_EXHDR(s); + + /* Truncate oversize input */ + if (len >= NAMEDATALEN) + len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); + + /* We use palloc0 here to ensure result is zero-padded */ + result = (Name) palloc0(NAMEDATALEN); + memcpy(NameStr(*result), VARDATA_ANY(s), len); + + PG_RETURN_NAME(result); } -/* - * byteacat - - * takes two bytea* and returns a bytea* that is the concatenation of - * the two. - * - * Cloned from textcat and modified as required. +/* name_text() + * Converts a Name type to a text type. */ Datum -byteacat(PG_FUNCTION_ARGS) +name_text(PG_FUNCTION_ARGS) { - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); + Name s = PG_GETARG_NAME(0); - PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); + PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); } + /* - * bytea_catenate - * Guts of byteacat(), broken out so it can be used by other functions + * textToQualifiedNameList - convert a text object to list of names * - * Arguments can be in short-header form, but not compressed or out-of-line + * This implements the input parsing needed by nextval() and other + * functions that take a text parameter representing a qualified name. + * We split the name at dots, downcase if not double-quoted, and + * truncate names if they're too long. */ -static bytea * -bytea_catenate(bytea *t1, bytea *t2) +List * +textToQualifiedNameList(text *textval) { - bytea *result; - int len1, - len2, - len; - char *ptr; + char *rawname; + List *result = NIL; + List *namelist; + ListCell *l; - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); + /* Convert to C string (handles possible detoasting). */ + /* Note we rely on being able to modify rawname below. */ + rawname = text_to_cstring(textval); - /* paranoia ... probably should throw error instead? */ - if (len1 < 0) - len1 = 0; - if (len2 < 0) - len2 = 0; + if (!SplitIdentifierString(rawname, '.', &namelist)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); - len = len1 + len2 + VARHDRSZ; - result = (bytea *) palloc(len); + if (namelist == NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); - /* Set size of result string... */ - SET_VARSIZE(result, len); + foreach(l, namelist) + { + char *curname = (char *) lfirst(l); - /* Fill data field of result string... */ - ptr = VARDATA(result); - if (len1 > 0) - memcpy(ptr, VARDATA_ANY(t1), len1); - if (len2 > 0) - memcpy(ptr + len1, VARDATA_ANY(t2), len2); + result = lappend(result, makeString(pstrdup(curname))); + } + + pfree(rawname); + list_free(namelist); return result; } -#define PG_STR_GET_BYTEA(str_) \ - DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) - /* - * bytea_substr() - * Return a substring starting at the specified position. - * Cloned from text_substr and modified as required. + * SplitIdentifierString --- parse a string containing identifiers * - * Input: - * - string - * - starting position (is one-based) - * - string length (optional) - * - * If the starting position is zero or less, then return from the start of the string - * adjusting the length to be consistent with the "negative start" per SQL. - * If the length is less than zero, an ERROR is thrown. If no third argument - * (length) is provided, the length to the end of the string is assumed. - */ -Datum -bytea_substr(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - PG_GETARG_INT32(2), - false)); -} - -/* - * bytea_substr_no_len - - * Wrapper to avoid opr_sanity failure due to - * one function accepting a different number of args. - */ -Datum -bytea_substr_no_len(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - -1, - true)); -} - -static bytea * -bytea_substring(Datum str, - int S, - int L, - bool length_not_specified) -{ - int32 S1; /* adjusted start position */ - int32 L1; /* adjusted substring length */ - int32 E; /* end position */ - - /* - * The logic here should generally match text_substring(). - */ - S1 = Max(S, 1); - - if (length_not_specified) - { - /* - * Not passed a length - DatumGetByteaPSlice() grabs everything to the - * end of the string if we pass it a negative value for length. - */ - L1 = -1; - } - else if (L < 0) - { - /* SQL99 says to throw an error for E < S, i.e., negative length */ - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - L1 = -1; /* silence stupider compilers */ - } - else if (pg_add_s32_overflow(S, L, &E)) - { - /* - * L could be large enough for S + L to overflow, in which case the - * substring must run to end of string. - */ - L1 = -1; - } - else - { - /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. - */ - if (E < 1) - return PG_STR_GET_BYTEA(""); - - L1 = E - S1; - } - - /* - * If the start position is past the end of the string, SQL99 says to - * return a zero-length string -- DatumGetByteaPSlice() will do that for - * us. We need only convert S1 to zero-based starting position. - */ - return DatumGetByteaPSlice(str, S1 - 1, L1); -} - -/* - * byteaoverlay - * Replace specified substring of first string with second - * - * The SQL standard defines OVERLAY() in terms of substring and concatenation. - * This code is a direct implementation of what the standard says. - */ -Datum -byteaoverlay(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl = PG_GETARG_INT32(3); /* substring length */ - - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -Datum -byteaoverlay_no_len(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl; - - sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -static bytea * -bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) -{ - bytea *result; - bytea *s1; - bytea *s2; - int sp_pl_sl; - - /* - * Check for possible integer-overflow cases. For negative sp, throw a - * "substring length" error because that's what should be expected - * according to the spec's definition of OVERLAY(). - */ - if (sp <= 0) - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - - s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); - s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); - result = bytea_catenate(s1, t2); - result = bytea_catenate(result, s2); - - return result; -} - -/* - * bit_count - */ -Datum -bytea_bit_count(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - - PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); -} - -/* - * byteapos - - * Return the position of the specified substring. - * Implements the SQL POSITION() function. - * Cloned from textpos and modified as required. - */ -Datum -byteapos(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int pos; - int px, - p; - int len1, - len2; - char *p1, - *p2; - - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); - - if (len2 <= 0) - PG_RETURN_INT32(1); /* result for empty pattern */ - - p1 = VARDATA_ANY(t1); - p2 = VARDATA_ANY(t2); - - pos = 0; - px = (len1 - len2); - for (p = 0; p <= px; p++) - { - if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) - { - pos = p + 1; - break; - }; - p1++; - }; - - PG_RETURN_INT32(pos); -} - -/*------------------------------------------------------------- - * byteaGetByte - * - * this routine treats "bytea" as an array of bytes. - * It returns the Nth byte (a number between 0 and 255). - *------------------------------------------------------------- - */ -Datum -byteaGetByte(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int32 n = PG_GETARG_INT32(1); - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - byte = ((unsigned char *) VARDATA_ANY(v))[n]; - - PG_RETURN_INT32(byte); -} - -/*------------------------------------------------------------- - * byteaGetBit - * - * This routine treats a "bytea" type like an array of bits. - * It returns the value of the Nth bit (0 or 1). - * - *------------------------------------------------------------- - */ -Datum -byteaGetBit(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int64 n = PG_GETARG_INT64(1); - int byteNo, - bitNo; - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; - - if (byte & (1 << bitNo)) - PG_RETURN_INT32(1); - else - PG_RETURN_INT32(0); -} - -/*------------------------------------------------------------- - * byteaSetByte - * - * Given an instance of type 'bytea' creates a new one with - * the Nth byte set to the given value. - * - *------------------------------------------------------------- - */ -Datum -byteaSetByte(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int32 n = PG_GETARG_INT32(1); - int32 newByte = PG_GETARG_INT32(2); - int len; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - /* - * Now set the byte. - */ - ((unsigned char *) VARDATA(res))[n] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/*------------------------------------------------------------- - * byteaSetBit - * - * Given an instance of type 'bytea' creates a new one with - * the Nth bit set to the given value. - * - *------------------------------------------------------------- - */ -Datum -byteaSetBit(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int64 n = PG_GETARG_INT64(1); - int32 newBit = PG_GETARG_INT32(2); - int len; - int oldByte, - newByte; - int byteNo, - bitNo; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - /* - * sanity check! - */ - if (newBit != 0 && newBit != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("new bit must be 0 or 1"))); - - /* - * Update the byte. - */ - oldByte = ((unsigned char *) VARDATA(res))[byteNo]; - - if (newBit == 0) - newByte = oldByte & (~(1 << bitNo)); - else - newByte = oldByte | (1 << bitNo); - - ((unsigned char *) VARDATA(res))[byteNo] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/* - * Return reversed bytea - */ -Datum -bytea_reverse(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - const char *p = VARDATA_ANY(v); - int len = VARSIZE_ANY_EXHDR(v); - const char *endp = p + len; - bytea *result = palloc(len + VARHDRSZ); - char *dst = (char *) VARDATA(result) + len; - - SET_VARSIZE(result, len + VARHDRSZ); - - while (p < endp) - *(--dst) = *p++; - - PG_RETURN_BYTEA_P(result); -} - - -/* text_name() - * Converts a text type to a Name type. - */ -Datum -text_name(PG_FUNCTION_ARGS) -{ - text *s = PG_GETARG_TEXT_PP(0); - Name result; - int len; - - len = VARSIZE_ANY_EXHDR(s); - - /* Truncate oversize input */ - if (len >= NAMEDATALEN) - len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); - - /* We use palloc0 here to ensure result is zero-padded */ - result = (Name) palloc0(NAMEDATALEN); - memcpy(NameStr(*result), VARDATA_ANY(s), len); - - PG_RETURN_NAME(result); -} - -/* name_text() - * Converts a Name type to a text type. - */ -Datum -name_text(PG_FUNCTION_ARGS) -{ - Name s = PG_GETARG_NAME(0); - - PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); -} - - -/* - * textToQualifiedNameList - convert a text object to list of names - * - * This implements the input parsing needed by nextval() and other - * functions that take a text parameter representing a qualified name. - * We split the name at dots, downcase if not double-quoted, and - * truncate names if they're too long. - */ -List * -textToQualifiedNameList(text *textval) -{ - char *rawname; - List *result = NIL; - List *namelist; - ListCell *l; - - /* Convert to C string (handles possible detoasting). */ - /* Note we rely on being able to modify rawname below. */ - rawname = text_to_cstring(textval); - - if (!SplitIdentifierString(rawname, '.', &namelist)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_NAME), - errmsg("invalid name syntax"))); - - if (namelist == NIL) - ereport(ERROR, - (errcode(ERRCODE_INVALID_NAME), - errmsg("invalid name syntax"))); - - foreach(l, namelist) - { - char *curname = (char *) lfirst(l); - - result = lappend(result, makeString(pstrdup(curname))); - } - - pfree(rawname); - list_free(namelist); - - return result; -} - -/* - * SplitIdentifierString --- parse a string containing identifiers - * - * This is the guts of textToQualifiedNameList, and is exported for use in - * other situations such as parsing GUC variables. In the GUC case, it's - * important to avoid memory leaks, so the API is designed to minimize the - * amount of stuff that needs to be allocated and freed. + * This is the guts of textToQualifiedNameList, and is exported for use in + * other situations such as parsing GUC variables. In the GUC case, it's + * important to avoid memory leaks, so the API is designed to minimize the + * amount of stuff that needs to be allocated and freed. * * Inputs: * rawstring: the input string; must be overwritable! On return, it's @@ -3849,331 +3076,6 @@ SplitGUCList(char *rawstring, char separator, return true; } - -/***************************************************************************** - * Comparison Functions used for bytea - * - * Note: btree indexes need these routines not to leak memory; therefore, - * be careful to free working copies of toasted datums. Most places don't - * need to be so careful. - *****************************************************************************/ - -Datum -byteaeq(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. - */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = false; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) == 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -byteane(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. - */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = true; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) != 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -bytealt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); -} - -Datum -byteale(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); -} - -Datum -byteagt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); -} - -Datum -byteage(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); -} - -Datum -byteacmp(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - if ((cmp == 0) && (len1 != len2)) - cmp = (len1 < len2) ? -1 : 1; - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_INT32(cmp); -} - -Datum -bytea_larger(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_smaller(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_sortsupport(PG_FUNCTION_ARGS) -{ - SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); - - MemoryContextSwitchTo(oldcontext); - - PG_RETURN_VOID(); -} - -/* Cast bytea -> int2 */ -Datum -bytea_int2(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint16 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("smallint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT16(result); -} - -/* Cast bytea -> int4 */ -Datum -bytea_int4(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint32 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT32(result); -} - -/* Cast bytea -> int8 */ -Datum -bytea_int8(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint64 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT64(result); -} - -/* Cast int2 -> bytea; can just use int2send() */ -Datum -int2_bytea(PG_FUNCTION_ARGS) -{ - return int2send(fcinfo); -} - -/* Cast int4 -> bytea; can just use int4send() */ -Datum -int4_bytea(PG_FUNCTION_ARGS) -{ - return int4send(fcinfo); -} - -/* Cast int8 -> bytea; can just use int8send() */ -Datum -int8_bytea(PG_FUNCTION_ARGS) -{ - return int8send(fcinfo); -} - /* * appendStringInfoText * diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index a4150bff2eaea..182e8f75db75c 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -529,14 +529,36 @@ xmltext(PG_FUNCTION_ARGS) #ifdef USE_LIBXML text *arg = PG_GETARG_TEXT_PP(0); text *result; - xmlChar *xmlbuf = NULL; + volatile xmlChar *xmlbuf = NULL; + PgXmlErrorContext *xmlerrcxt; + + /* First we gotta spin up some error handling. */ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + + PG_TRY(); + { + xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + + if (xmlbuf == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); - xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + result = cstring_to_text_with_len((const char *) xmlbuf, + xmlStrlen((const xmlChar *) xmlbuf)); + } + PG_CATCH(); + { + if (xmlbuf) + xmlFree((xmlChar *) xmlbuf); - Assert(xmlbuf); + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); + + xmlFree((xmlChar *) xmlbuf); + pg_xml_done(xmlerrcxt, false); - result = cstring_to_text_with_len((const char *) xmlbuf, xmlStrlen(xmlbuf)); - xmlFree(xmlbuf); PG_RETURN_XML_P(result); #else NO_XML_SUPPORT(); @@ -663,7 +685,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) volatile xmlBufferPtr buf = NULL; volatile xmlSaveCtxtPtr ctxt = NULL; ErrorSaveContext escontext = {T_ErrorSaveContext}; - PgXmlErrorContext *xmlerrcxt; + PgXmlErrorContext *volatile xmlerrcxt = NULL; #endif if (xmloption_arg != XMLOPTION_DOCUMENT && !indent) @@ -704,13 +726,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) return (text *) data; } - /* Otherwise, we gotta spin up some error handling. */ - xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - + /* + * Otherwise, we gotta spin up some error handling. Unlike most other + * routines in this module, we already have a libxml "doc" structure to + * free, so we need to call pg_xml_init() inside the PG_TRY and be + * prepared for it to fail (typically due to palloc OOM). + */ PG_TRY(); { size_t decl_len = 0; + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + /* The serialized data will go into this buffer. */ buf = xmlBufferCreate(); @@ -770,7 +797,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) if (oldroot != NULL) xmlFreeNode(oldroot); - xmlAddChildList(root, content_nodes); + if (xmlAddChildList(root, content_nodes) == NULL || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not append xml node list"); /* * We use this node to insert newlines in the dump. Note: in at @@ -838,10 +868,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xmlSaveClose(ctxt); if (buf) xmlBufferFree(buf); - if (doc) - xmlFreeDoc(doc); + xmlFreeDoc(doc); - pg_xml_done(xmlerrcxt, true); + if (xmlerrcxt) + pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } @@ -931,7 +961,10 @@ xmlelement(XmlExpr *xexpr, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate xmlTextWriter"); - xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name); + if (xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not start xml element"); forboth(arg, named_arg_strings, narg, xexpr->arg_names) { @@ -939,19 +972,30 @@ xmlelement(XmlExpr *xexpr, char *argname = strVal(lfirst(narg)); if (str) - xmlTextWriterWriteAttribute(writer, - (xmlChar *) argname, - (xmlChar *) str); + { + if (xmlTextWriterWriteAttribute(writer, + (xmlChar *) argname, + (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write xml attribute"); + } } foreach(arg, arg_strings) { char *str = (char *) lfirst(arg); - xmlTextWriterWriteRaw(writer, (xmlChar *) str); + if (xmlTextWriterWriteRaw(writer, (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write raw xml text"); } - xmlTextWriterEndElement(writer); + if (xmlTextWriterEndElement(writer) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not end xml element"); /* we MUST do this now to flush data out to the buffer ... */ xmlFreeTextWriter(writer); @@ -1725,7 +1769,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1751,6 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1778,7 +1823,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1809,18 +1853,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. - * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT : @@ -1830,11 +1862,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. + * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1856,10 +1903,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - xmlNodePtr oldroot PG_USED_FOR_ASSERTS_ONLY; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1872,43 +1916,22 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - - /* - * This attaches root to doc, so we need not free it separately; - * and there can't yet be any old root to free. - */ - oldroot = xmlDocSetRootElement(doc, root); - Assert(oldroot == NULL); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 1 : 0); /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + parsed_nodes); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } @@ -1917,6 +1940,8 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1928,6 +1953,9 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); @@ -4220,20 +4248,27 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - xmlChar *str; + volatile xmlChar *str = NULL; - str = xmlXPathCastNodeToString(cur); PG_TRY(); { + char *escaped; + + str = xmlXPathCastNodeToString(cur); + if (str == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); + /* Here we rely on XML having the same representation as TEXT */ - char *escaped = escape_xml((char *) str); + escaped = escape_xml((char *) str); result = (xmltype *) cstring_to_text(escaped); pfree(escaped); } PG_FINALLY(); { - xmlFree(str); + if (str) + xmlFree((xmlChar *) str); } PG_END_TRY(); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 657648996c235..d1b25214376ed 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -317,6 +317,7 @@ GetCCHashEqFuncs(Oid keytype, CCHashFN *hashfunc, RegProcedure *eqfunc, CCFastEq case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *hashfunc = int4hashfast; *fasteqfunc = int4eqfast; *eqfunc = F_OIDEQ; diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 89a1c79e984d1..0c506d320b137 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -1283,6 +1283,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, CachedPlan *plan = NULL; List *qlist; bool customplan; + ListCell *lc; /* Assert caller is doing things in a sane order */ Assert(plansource->magic == CACHEDPLANSOURCE_MAGIC); @@ -1385,6 +1386,13 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, plan->is_saved = true; } + foreach(lc, plan->stmt_list) + { + PlannedStmt *pstmt = (PlannedStmt *) lfirst(lc); + + pstmt->planOrigin = customplan ? PLAN_STMT_CACHE_CUSTOM : PLAN_STMT_CACHE_GENERIC; + } + return plan; } diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1ad155d446e51..42e9be274fc6a 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -195,6 +195,7 @@ struct HASHHDR long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ int nelem_alloc; /* number of entries to allocate at once */ + bool isfixed; /* if true, don't enlarge */ #ifdef HASH_STATISTICS @@ -227,7 +228,6 @@ struct HTAB MemoryContext hcxt; /* memory context if default allocator used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ - bool isfixed; /* if true, don't enlarge */ /* freezing a shared table isn't allowed, so we can keep state here */ bool frozen; /* true = no more inserts allowed */ @@ -618,8 +618,10 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags) } } + /* Set isfixed if requested, but not till after we build initial entries */ if (flags & HASH_FIXED_SIZE) - hashp->isfixed = true; + hctl->isfixed = true; + return hashp; } @@ -644,6 +646,8 @@ hdefault(HTAB *hashp) hctl->ssize = DEF_SEGSIZE; hctl->sshift = DEF_SEGSIZE_SHIFT; + hctl->isfixed = false; /* can be enlarged */ + #ifdef HASH_STATISTICS hctl->accesses = hctl->collisions = 0; #endif @@ -1713,7 +1717,7 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx) HASHELEMENT *prevElement; int i; - if (hashp->isfixed) + if (hctl->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index c86ceefda940b..641e535a73c7c 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -417,12 +417,11 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); ctype = TextDatumGetCString(datum); - if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) - ereport(FATAL, - (errmsg("database locale is incompatible with operating system"), - errdetail("The database was initialized with LC_COLLATE \"%s\", " - " which is not recognized by setlocale().", collate), - errhint("Recreate the database with another locale or install the missing locale."))); + /* + * Historcally, we set LC_COLLATE from datcollate, as well. That's no + * longer necessary because all collation behavior is handled through + * pg_locale_t. + */ if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 511dc32d51921..d14b1678e7fec 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -3081,7 +3081,7 @@ struct config_int ConfigureNamesInt[] = }, &max_slot_wal_keep_size_mb, -1, -1, MAX_KILOBYTES, - check_max_slot_wal_keep_size, NULL, NULL + NULL, NULL, NULL }, { @@ -3100,11 +3100,11 @@ struct config_int ConfigureNamesInt[] = gettext_noop("Sets the duration a replication slot can remain idle before " "it is invalidated."), NULL, - GUC_UNIT_MIN + GUC_UNIT_S }, - &idle_replication_slot_timeout_mins, - 0, 0, INT_MAX / SECS_PER_MINUTE, - check_idle_replication_slot_timeout, NULL, NULL + &idle_replication_slot_timeout_secs, + 0, 0, INT_MAX, + NULL, NULL, NULL }, { diff --git a/src/backend/utils/misc/injection_point.c b/src/backend/utils/misc/injection_point.c index f58ebc8ee522d..83b887b697807 100644 --- a/src/backend/utils/misc/injection_point.c +++ b/src/backend/utils/misc/injection_point.c @@ -584,3 +584,49 @@ IsInjectionPointAttached(const char *name) return false; /* silence compiler */ #endif } + +/* + * Retrieve a list of all the injection points currently attached. + * + * This list is palloc'd in the current memory context. + */ +List * +InjectionPointList(void) +{ +#ifdef USE_INJECTION_POINTS + List *inj_points = NIL; + uint32 max_inuse; + + LWLockAcquire(InjectionPointLock, LW_SHARED); + + max_inuse = pg_atomic_read_u32(&ActiveInjectionPoints->max_inuse); + + for (uint32 idx = 0; idx < max_inuse; idx++) + { + InjectionPointEntry *entry; + InjectionPointData *inj_point; + uint64 generation; + + entry = &ActiveInjectionPoints->entries[idx]; + generation = pg_atomic_read_u64(&entry->generation); + + /* skip free slots */ + if (generation % 2 == 0) + continue; + + inj_point = (InjectionPointData *) palloc0(sizeof(InjectionPointData)); + inj_point->name = pstrdup(entry->name); + inj_point->library = pstrdup(entry->library); + inj_point->function = pstrdup(entry->function); + inj_points = lappend(inj_points, inj_point); + } + + LWLockRelease(InjectionPointLock); + + return inj_points; + +#else + elog(ERROR, "Injection points are not supported by this build"); + return NIL; /* keep compiler quiet */ +#endif +} diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 341f88adc87b2..a9d8293474af5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -342,7 +342,7 @@ # (change requires restart) #wal_keep_size = 0 # in megabytes; 0 disables #max_slot_wal_keep_size = -1 # in megabytes; -1 disables -#idle_replication_slot_timeout = 0 # in minutes; 0 disables +#idle_replication_slot_timeout = 0 # in seconds; 0 disables #wal_sender_timeout = 60s # in milliseconds; 0 disables #track_commit_timestamp = off # collect timestamp of transaction commit # (change requires restart) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 17d4f7a7a06e1..be43e9351c3d3 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -531,6 +531,21 @@ dsa_attach(dsa_handle handle) return area; } +/* + * Returns whether the area with the given handle was already attached by the + * current process. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +bool +dsa_is_attached(dsa_handle handle) +{ + /* + * An area handle is really a DSM segment handle for the first segment, so + * we can just search for that. + */ + return dsm_find_mapping(handle) != NULL; +} + /* * Attach to an area that was created with dsa_create_in_place. The caller * must somehow know the location in memory that was used when the area was diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 15fa4d0a55eeb..ce01dce9861da 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -560,9 +560,7 @@ MemoryContextDeleteChildren(MemoryContext context) * the specified context, since that means it will automatically be freed * when no longer needed. * - * There is no API for deregistering a callback once registered. If you - * want it to not do anything anymore, adjust the state pointed to by its - * "arg" to indicate that. + * Note that callers can assume this cannot fail. */ void MemoryContextRegisterResetCallback(MemoryContext context, @@ -577,6 +575,41 @@ MemoryContextRegisterResetCallback(MemoryContext context, context->isReset = false; } +/* + * MemoryContextUnregisterResetCallback + * Undo the effects of MemoryContextRegisterResetCallback. + * + * This can be used if a callback's effects are no longer required + * at some point before the context has been reset/deleted. It is the + * caller's responsibility to pfree the callback struct (if needed). + * + * An assertion failure occurs if the callback was not registered. + * We could alternatively define that case as a no-op, but that seems too + * likely to mask programming errors such as passing the wrong context. + */ +void +MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + MemoryContextCallback *prev, + *cur; + + Assert(MemoryContextIsValid(context)); + + for (prev = NULL, cur = context->reset_cbs; cur != NULL; + prev = cur, cur = cur->next) + { + if (cur != cb) + continue; + if (prev) + prev->next = cur->next; + else + context->reset_cbs = cur->next; + return; + } + Assert(false); +} + /* * MemoryContextCallResetCallbacks * Internal function to call all registered callbacks for context. diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile index 997e0a013e956..c0470efda92a3 100644 --- a/src/bin/initdb/Makefile +++ b/src/bin/initdb/Makefile @@ -20,7 +20,7 @@ include $(top_builddir)/src/Makefile.global # from libpq, else we have risks of version skew if we run with a libpq # shared library from a different PG version. Define # USE_PRIVATE_ENCODING_FUNCS to ensure that that happens. -override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS) +override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) $(ICU_CFLAGS) # We need libpq only because fe_utils does. LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS) diff --git a/src/bin/pg_amcheck/t/004_verify_heapam.pl b/src/bin/pg_amcheck/t/004_verify_heapam.pl index 2a3af2666f52a..72693660fb64b 100644 --- a/src/bin/pg_amcheck/t/004_verify_heapam.pl +++ b/src/bin/pg_amcheck/t/004_verify_heapam.pl @@ -529,7 +529,7 @@ sub header $tup->{t_infomask2} |= HEAP_NATTS_MASK; push @expected, - qr/${$header}number of attributes 2047 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 2047 exceeds maximum 3 expected for table/; } elsif ($offnum == 10) { @@ -552,7 +552,7 @@ sub header $tup->{t_hoff} = 32; push @expected, - qr/${$header}number of attributes 67 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 67 exceeds maximum 3 expected for table/; } elsif ($offnum == 12) { diff --git a/src/bin/pg_basebackup/meson.build b/src/bin/pg_basebackup/meson.build index 8a1c96b4f5c84..3a7fc10eab02f 100644 --- a/src/bin/pg_basebackup/meson.build +++ b/src/bin/pg_basebackup/meson.build @@ -93,9 +93,9 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? tar.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', }, 'tests': [ 't/010_pg_basebackup.pl', diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index eb7354200bcee..55621f35fb6b7 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -487,7 +487,7 @@ reached_end_position(XLogRecPtr segendpos, uint32 timeline, if (r < 0) pg_fatal("could not read from ready pipe: %m"); - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; @@ -629,7 +629,7 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, param->wal_compress_level = wal_compress_level; /* Convert the starting position */ - if (sscanf(startpos, "%X/%X", &hi, &lo) != 2) + if (sscanf(startpos, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", startpos); param->startptr = ((uint64) hi) << 32 | lo; @@ -2255,7 +2255,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, * value directly in the variable, and then set the flag that says * it's there. */ - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index 11f71c0380181..3986882f04292 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -1250,8 +1250,17 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c appendPQExpBufferStr(recoveryconfcontents, "recovery_target = ''\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_timeline = 'latest'\n"); + + /* + * Set recovery_target_inclusive = false to avoid reapplying the + * transaction committed at 'lsn' after subscription is enabled. This is + * because the provided 'lsn' is also used as the replication start point + * for the subscription. So, the server can send the transaction committed + * at that 'lsn' after replication is started which can lead to applying + * the same transaction twice if we keep recovery_target_inclusive = true. + */ appendPQExpBufferStr(recoveryconfcontents, - "recovery_target_inclusive = true\n"); + "recovery_target_inclusive = false\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_action = promote\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_name = ''\n"); @@ -1262,7 +1271,7 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c { appendPQExpBufferStr(recoveryconfcontents, "# dry run mode"); appendPQExpBuffer(recoveryconfcontents, - "recovery_target_lsn = '%X/%X'\n", + "recovery_target_lsn = '%X/%08X'\n", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else @@ -1876,7 +1885,7 @@ set_replication_progress(PGconn *conn, const struct LogicalRepInfo *dbinfo, cons if (dry_run) { suboid = InvalidOid; - lsnstr = psprintf("%X/%X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); + lsnstr = psprintf("%X/%08X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else { diff --git a/src/bin/pg_basebackup/pg_receivewal.c b/src/bin/pg_basebackup/pg_receivewal.c index e816cf58101fb..289ca14dcfe58 100644 --- a/src/bin/pg_basebackup/pg_receivewal.c +++ b/src/bin/pg_basebackup/pg_receivewal.c @@ -188,14 +188,14 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) /* we assume that we get called once at the end of each segment */ if (verbose && segment_finished) - pg_log_info("finished segment at %X/%X (timeline %u)", + pg_log_info("finished segment at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); if (!XLogRecPtrIsInvalid(endpos) && endpos < xlogpos) { if (verbose) - pg_log_info("stopped log streaming at %X/%X (timeline %u)", + pg_log_info("stopped log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); time_to_stop = true; @@ -211,7 +211,7 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) * timeline, but it's close enough for reporting purposes. */ if (verbose && prevtimeline != 0 && prevtimeline != timeline) - pg_log_info("switched to timeline %u at %X/%X", + pg_log_info("switched to timeline %u at %X/%08X", timeline, LSN_FORMAT_ARGS(prevpos)); @@ -575,7 +575,7 @@ StreamLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (timeline %u)", + pg_log_info("starting log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(stream.startpos), stream.timeline); @@ -689,7 +689,7 @@ main(int argc, char **argv) basedir = pg_strdup(optarg); break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c index fb7a6a1d05d8d..8a5dd24e6c9ad 100644 --- a/src/bin/pg_basebackup/pg_recvlogical.c +++ b/src/bin/pg_basebackup/pg_recvlogical.c @@ -144,7 +144,7 @@ sendFeedback(PGconn *conn, TimestampTz now, bool force, bool replyRequested) return true; if (verbose) - pg_log_info("confirming write up to %X/%X, flush to %X/%X (slot %s)", + pg_log_info("confirming write up to %X/%08X, flush to %X/%08X (slot %s)", LSN_FORMAT_ARGS(output_written_lsn), LSN_FORMAT_ARGS(output_fsync_lsn), replication_slot); @@ -238,13 +238,13 @@ StreamLogicalLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (slot %s)", + pg_log_info("starting log streaming at %X/%08X (slot %s)", LSN_FORMAT_ARGS(startpos), replication_slot); /* Initiate the replication stream at specified location */ query = createPQExpBuffer(); - appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%X", + appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%08X", replication_slot, LSN_FORMAT_ARGS(startpos)); /* print options if there are any */ @@ -800,12 +800,12 @@ main(int argc, char **argv) break; /* replication options */ case 'I': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse start position \"%s\"", optarg); startpos = ((uint64) hi) << 32 | lo; break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; @@ -1075,12 +1075,12 @@ prepareToTerminate(PGconn *conn, XLogRecPtr endpos, StreamStopReason reason, pg_log_info("received interrupt signal, exiting"); break; case STREAM_STOP_KEEPALIVE: - pg_log_info("end position %X/%X reached by keepalive", + pg_log_info("end position %X/%08X reached by keepalive", LSN_FORMAT_ARGS(endpos)); break; case STREAM_STOP_END_OF_WAL: Assert(!XLogRecPtrIsInvalid(lsn)); - pg_log_info("end position %X/%X reached by WAL record at %X/%X", + pg_log_info("end position %X/%08X reached by WAL record at %X/%08X", LSN_FORMAT_ARGS(endpos), LSN_FORMAT_ARGS(lsn)); break; case STREAM_STOP_NONE: diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 6b6e32dfbdf56..d6b7f117fa3bb 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -571,7 +571,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) return true; /* Initiate the replication stream at specified location */ - snprintf(query, sizeof(query), "START_REPLICATION %s%X/%X TIMELINE %u", + snprintf(query, sizeof(query), "START_REPLICATION %s%X/%08X TIMELINE %u", slotcmd, LSN_FORMAT_ARGS(stream->startpos), stream->timeline); @@ -628,7 +628,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) } if (stream->startpos > stoppos) { - pg_log_error("server stopped streaming timeline %u at %X/%X, but reported next timeline %u to begin at %X/%X", + pg_log_error("server stopped streaming timeline %u at %X/%08X, but reported next timeline %u to begin at %X/%08X", stream->timeline, LSN_FORMAT_ARGS(stoppos), newtimeline, LSN_FORMAT_ARGS(stream->startpos)); goto error; @@ -720,7 +720,7 @@ ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline) } *timeline = atoi(PQgetvalue(res, 0, 0)); - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &startpos_xlogid, + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &startpos_xlogid, &startpos_xrecoff) != 2) { pg_log_error("could not parse next timeline's starting point \"%s\"", diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index c7b8a4c3a4b6a..e5a7cb6e5b14e 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -445,7 +445,7 @@ RunIdentifySystem(PGconn *conn, char **sysid, TimeLineID *starttli, /* Get LSN start position if necessary */ if (startpos != NULL) { - if (sscanf(PQgetvalue(res, 0, 2), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 2), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse write-ahead log location \"%s\"", PQgetvalue(res, 0, 2)); @@ -551,7 +551,7 @@ GetSlotInformation(PGconn *conn, const char *slot_name, uint32 hi, lo; - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse restart_lsn \"%s\" for replication slot \"%s\"", PQgetvalue(res, 0, 1), slot_name); diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c index e89d4603f09dc..e774bc78a6264 100644 --- a/src/bin/pg_combinebackup/backup_label.c +++ b/src/bin/pg_combinebackup/backup_label.c @@ -247,7 +247,7 @@ parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) unsigned lo; *e = '\0'; - success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + success = (sscanf(s, "%X/%08X%n", &hi, &lo, &nchars) == 2); *e = save; if (success) diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 28e58cd8ef458..f5cef99f62730 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -569,7 +569,7 @@ check_backup_label_files(int n_backups, char **backup_dirs) pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u", backup_dirs[i], start_tli, check_tli); if (i < n_backups - 1 && start_lsn != check_lsn) - pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X", + pg_fatal("backup at \"%s\" starts at LSN %X/%08X, but expected %X/%08X", backup_dirs[i], LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(check_lsn)); diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c index 313f8929df509..819a3fd0b7a6b 100644 --- a/src/bin/pg_combinebackup/write_manifest.c +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -155,7 +155,7 @@ finalize_manifest(manifest_writer *mwriter, for (wal_range = first_wal_range; wal_range != NULL; wal_range = wal_range->next) appendStringInfo(&mwriter->buf, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", wal_range == first_wal_range ? "" : ",\n", wal_range->tli, LSN_FORMAT_ARGS(wal_range->start_lsn), diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 7bb801bb88612..10de058ce91f4 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -245,9 +245,9 @@ main(int argc, char *argv[]) dbState(ControlFile->state)); printf(_("pg_control last modified: %s\n"), pgctime_str); - printf(_("Latest checkpoint location: %X/%X\n"), + printf(_("Latest checkpoint location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPoint)); - printf(_("Latest checkpoint's REDO location: %X/%X\n"), + printf(_("Latest checkpoint's REDO location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)); printf(_("Latest checkpoint's REDO WAL file: %s\n"), xlogfilename); @@ -282,15 +282,15 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.newestCommitTsXid); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); - printf(_("Fake LSN counter for unlogged rels: %X/%X\n"), + printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->unloggedLSN)); - printf(_("Minimum recovery ending location: %X/%X\n"), + printf(_("Minimum recovery ending location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint)); printf(_("Min recovery ending loc's timeline: %u\n"), ControlFile->minRecoveryPointTLI); - printf(_("Backup start location: %X/%X\n"), + printf(_("Backup start location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupStartPoint)); - printf(_("Backup end location: %X/%X\n"), + printf(_("Backup end location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupEndPoint)); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index aa1589e3331d2..a1976fae607d6 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -17,6 +17,7 @@ #include +#include "catalog/pg_am_d.h" #include "catalog/pg_class_d.h" #include "catalog/pg_collation_d.h" #include "catalog/pg_extension_d.h" @@ -944,6 +945,24 @@ findOprByOid(Oid oid) return (OprInfo *) dobj; } +/* + * findAccessMethodByOid + * finds the DumpableObject for the access method with the given oid + * returns NULL if not found + */ +AccessMethodInfo * +findAccessMethodByOid(Oid oid) +{ + CatalogId catId; + DumpableObject *dobj; + + catId.tableoid = AccessMethodRelationId; + catId.oid = oid; + dobj = findObjectByCatalogId(catId); + Assert(dobj == NULL || dobj->objType == DO_ACCESS_METHOD); + return (AccessMethodInfo *) dobj; +} + /* * findCollationByOid * finds the DumpableObject for the collation with the given oid diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build index d8e9e101254b1..a2233b0a1b431 100644 --- a/src/bin/pg_dump/meson.build +++ b/src/bin/pg_dump/meson.build @@ -91,9 +91,9 @@ tests += { 'bd': meson.current_build_dir(), 'tap': { 'env': { - 'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', - 'ZSTD': program_zstd.found() ? program_zstd.path() : '', + 'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', + 'ZSTD': program_zstd.found() ? program_zstd.full_path() : '', 'with_icu': icu.found() ? 'yes' : 'no', }, 'tests': [ @@ -102,7 +102,6 @@ tests += { 't/003_pg_dump_with_server.pl', 't/004_pg_dump_parallel.pl', 't/005_pg_dump_filterfile.pl', - 't/006_pg_dumpall.pl', 't/010_dump_connstr.pl', ], }, diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index 5974d6706fd57..086adcdc50295 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -333,16 +333,6 @@ on_exit_close_archive(Archive *AHX) on_exit_nicely(archive_close_connection, &shutdown_info); } -/* - * When pg_restore restores multiple databases, then update already added entry - * into array for cleanup. - */ -void -replace_on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; -} - /* * on_exit_nicely handler for shutting down database connections and * worker processes cleanly. diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index af0007fb6d2f1..4ebef1e864451 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -308,7 +308,7 @@ extern void SetArchiveOptions(Archive *AH, DumpOptions *dopt, RestoreOptions *ro extern void ProcessArchiveRestoreOptions(Archive *AHX); -extern void RestoreArchive(Archive *AHX, bool append_data); +extern void RestoreArchive(Archive *AHX); /* Open an existing archive */ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 197c1295d93fd..dce88f040ace3 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -31,6 +31,8 @@ #endif #include "catalog/pg_class_d.h" +#include "catalog/pg_largeobject_metadata_d.h" +#include "catalog/pg_shdepend_d.h" #include "common/string.h" #include "compress_io.h" #include "dumputils.h" @@ -85,7 +87,7 @@ static int RestoringToDB(ArchiveHandle *AH); static void dump_lo_buf(ArchiveHandle *AH); static void dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim); static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, bool append_data); + const pg_compress_specification compression_spec); static CompressFileHandle *SaveOutput(ArchiveHandle *AH); static void RestoreOutput(ArchiveHandle *AH, CompressFileHandle *savedOutput); @@ -337,14 +339,9 @@ ProcessArchiveRestoreOptions(Archive *AHX) StrictNamesCheck(ropt); } -/* - * RestoreArchive - * - * If append_data is set, then append data into file as we are restoring dump - * of multiple databases which was taken by pg_dumpall. - */ +/* Public */ void -RestoreArchive(Archive *AHX, bool append_data) +RestoreArchive(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; RestoreOptions *ropt = AH->public.ropt; @@ -461,7 +458,7 @@ RestoreArchive(Archive *AHX, bool append_data) */ sav = SaveOutput(AH); if (ropt->filename || ropt->compression_spec.algorithm != PG_COMPRESSION_NONE) - SetOutput(AH, ropt->filename, ropt->compression_spec, append_data); + SetOutput(AH, ropt->filename, ropt->compression_spec); ahprintf(AH, "--\n-- PostgreSQL database dump\n--\n\n"); @@ -1300,7 +1297,7 @@ PrintTOCSummary(Archive *AHX) sav = SaveOutput(AH); if (ropt->filename) - SetOutput(AH, ropt->filename, out_compression_spec, false); + SetOutput(AH, ropt->filename, out_compression_spec); if (strftime(stamp_str, sizeof(stamp_str), PGDUMP_STRFTIME_FMT, localtime(&AH->createDate)) == 0) @@ -1679,8 +1676,7 @@ archprintf(Archive *AH, const char *fmt,...) static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, - bool append_data) + const pg_compress_specification compression_spec) { CompressFileHandle *CFH; const char *mode; @@ -1700,7 +1696,7 @@ SetOutput(ArchiveHandle *AH, const char *filename, else fn = fileno(stdout); - if (append_data || AH->mode == archModeAppend) + if (AH->mode == archModeAppend) mode = PG_BINARY_A; else mode = PG_BINARY_W; @@ -2974,6 +2970,19 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) int res = REQ_SCHEMA | REQ_DATA; RestoreOptions *ropt = AH->public.ropt; + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 && + strcmp(te->desc, "TABLE DATA") == 0 && + (te->catalogId.oid == LargeObjectMetadataRelationId || + te->catalogId.oid == SharedDependRelationId)) + return REQ_DATA; + /* These items are treated specially */ if (strcmp(te->desc, "ENCODING") == 0 || strcmp(te->desc, "STDSTRINGS") == 0 || diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 365073b3eae45..325b53fc9bd4b 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -394,7 +394,6 @@ struct _tocEntry extern int parallel_restore(ArchiveHandle *AH, TocEntry *te); extern void on_exit_close_archive(Archive *AHX); -extern void replace_on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *fmt,...) pg_attribute_printf(2, 3); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d94d0de2a5d17..b5ba3b46dd999 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -826,7 +826,7 @@ _CloseArchive(ArchiveHandle *AH) savVerbose = AH->public.verbose; AH->public.verbose = 0; - RestoreArchive((Archive *) AH, false); + RestoreArchive((Archive *) AH); SetArchiveOptions((Archive *) AH, savDopt, savRopt); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 1937997ea674d..273117c977c52 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -47,10 +47,13 @@ #include "catalog/pg_authid_d.h" #include "catalog/pg_cast_d.h" #include "catalog/pg_class_d.h" +#include "catalog/pg_constraint_d.h" #include "catalog/pg_default_acl_d.h" #include "catalog/pg_largeobject_d.h" +#include "catalog/pg_largeobject_metadata_d.h" #include "catalog/pg_proc_d.h" #include "catalog/pg_publication_d.h" +#include "catalog/pg_shdepend_d.h" #include "catalog/pg_subscription_d.h" #include "catalog/pg_type_d.h" #include "common/connect.h" @@ -209,6 +212,12 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; +/* + * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use + * as a dependency for pg_shdepend and any large object comments/seclabels. + */ +static DumpId lo_metadata_dumpId; + /* Maximum number of relations to fetch in a fetchAttributeStats() call. */ #define MAX_ATTR_STATS_RELS 64 @@ -1085,6 +1094,36 @@ main(int argc, char **argv) if (!dopt.dumpData && dopt.sequence_data) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (dopt.binary_upgrade && fout->remoteVersion >= 120000) + { + TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + TableInfo *shdepend = findTableByOid(SharedDependRelationId); + + makeTableDataInfo(&dopt, lo_metadata); + makeTableDataInfo(&dopt, shdepend); + + /* + * Save pg_largeobject_metadata's dump ID for use as a dependency for + * pg_shdepend and any large object comments/seclabels. + */ + lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; + addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); + + /* + * Only dump large object shdepend rows for this database. + */ + shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " + "AND dbid = (SELECT oid FROM pg_database " + " WHERE datname = current_database())"; + } + /* * In binary-upgrade mode, we do not have to worry about the actual LO * data or the associated metadata that resides in the pg_largeobject and @@ -1226,7 +1265,7 @@ main(int argc, char **argv) * right now. */ if (plainText) - RestoreArchive(fout, false); + RestoreArchive(fout); CloseArchive(fout); @@ -2168,6 +2207,13 @@ selectDumpableProcLang(ProcLangInfo *plang, Archive *fout) static void selectDumpableAccessMethod(AccessMethodInfo *method, Archive *fout) { + /* see getAccessMethods() comment about v9.6. */ + if (fout->remoteVersion < 90600) + { + method->dobj.dump = DUMP_COMPONENT_NONE; + return; + } + if (checkExtensionMembership(&method->dobj, fout)) return; /* extension membership overrides all else */ @@ -3924,10 +3970,37 @@ getLOs(Archive *fout) * as it will be copied by pg_upgrade, which simply copies the * pg_largeobject table. We *do* however dump out anything but the * data, as pg_upgrade copies just pg_largeobject, but not - * pg_largeobject_metadata, after the dump is restored. + * pg_largeobject_metadata, after the dump is restored. In versions + * before v12, this is done via proper large object commands. In + * newer versions, we dump the content of pg_largeobject_metadata and + * any associated pg_shdepend rows, which is faster to restore. (On + *