Skip to content

Commit 569174f

Browse files
committed
btree: Support parallel index scans.
This isn't exposed to the optimizer or the executor yet; we'll add support for those things in a separate patch. But this puts the basic mechanism in place: several processes can attach to a parallel btree index scan, and each one will get a subset of the tuples that would have been produced by a non-parallel scan. Each index page becomes the responsibility of a single worker, which then returns all of the TIDs on that page. Rahila Syed, Amit Kapila, Robert Haas, reviewed and tested by Anastasia Lubennikova, Tushar Ahuja, and Haribabu Kommi.
1 parent 8569955 commit 569174f

File tree

8 files changed

+527
-50
lines changed

8 files changed

+527
-50
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1207,14 +1207,18 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
12071207
<entry>Waiting in an extension.</entry>
12081208
</row>
12091209
<row>
1210-
<entry morerows="9"><literal>IPC</></entry>
1210+
<entry morerows="10"><literal>IPC</></entry>
12111211
<entry><literal>BgWorkerShutdown</></entry>
12121212
<entry>Waiting for background worker to shut down.</entry>
12131213
</row>
12141214
<row>
12151215
<entry><literal>BgWorkerStartup</></entry>
12161216
<entry>Waiting for background worker to start up.</entry>
12171217
</row>
1218+
<row>
1219+
<entry><literal>BtreePage</></entry>
1220+
<entry>Waiting for the page number needed to continue a parallel B-tree scan to become available.</entry>
1221+
</row>
12181222
<row>
12191223
<entry><literal>ExecuteGather</></entry>
12201224
<entry>Waiting for activity from child process when executing <literal>Gather</> node.</entry>

src/backend/access/nbtree/nbtree.c

Lines changed: 256 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include "access/xlog.h"
2424
#include "catalog/index.h"
2525
#include "commands/vacuum.h"
26+
#include "pgstat.h"
27+
#include "storage/condition_variable.h"
2628
#include "storage/indexfsm.h"
2729
#include "storage/ipc.h"
2830
#include "storage/lmgr.h"
@@ -63,6 +65,45 @@ typedef struct
6365
MemoryContext pagedelcontext;
6466
} BTVacState;
6567

68+
/*
69+
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
70+
*
71+
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
72+
* a new page; others must wait.
73+
*
74+
* BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
75+
* to a new page; some process can start doing that.
76+
*
77+
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
78+
* We reach this state once for every distinct combination of array keys.
79+
*/
80+
typedef enum
81+
{
82+
BTPARALLEL_NOT_INITIALIZED,
83+
BTPARALLEL_ADVANCING,
84+
BTPARALLEL_IDLE,
85+
BTPARALLEL_DONE
86+
} BTPS_State;
87+
88+
/*
89+
* BTParallelScanDescData contains btree specific shared information required
90+
* for parallel scan.
91+
*/
92+
typedef struct BTParallelScanDescData
93+
{
94+
BlockNumber btps_scanPage; /* latest or next page to be scanned */
95+
BTPS_State btps_pageStatus;/* indicates whether next page is available
96+
* for scan. See above for possible states of
97+
* parallel scan. */
98+
int btps_arrayKeyCount; /* count indicating number of array
99+
* scan keys processed by parallel
100+
* scan */
101+
slock_t btps_mutex; /* protects above variables */
102+
ConditionVariable btps_cv; /* used to synchronize parallel scan */
103+
} BTParallelScanDescData;
104+
105+
typedef struct BTParallelScanDescData *BTParallelScanDesc;
106+
66107

67108
static void btbuildCallback(Relation index,
68109
HeapTuple htup,
@@ -118,9 +159,9 @@ bthandler(PG_FUNCTION_ARGS)
118159
amroutine->amendscan = btendscan;
119160
amroutine->ammarkpos = btmarkpos;
120161
amroutine->amrestrpos = btrestrpos;
121-
amroutine->amestimateparallelscan = NULL;
122-
amroutine->aminitparallelscan = NULL;
123-
amroutine->amparallelrescan = NULL;
162+
amroutine->amestimateparallelscan = btestimateparallelscan;
163+
amroutine->aminitparallelscan = btinitparallelscan;
164+
amroutine->amparallelrescan = btparallelrescan;
124165

125166
PG_RETURN_POINTER(amroutine);
126167
}
@@ -491,6 +532,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
491532
}
492533

493534
so->markItemIndex = -1;
535+
so->arrayKeyCount = 0;
494536
BTScanPosUnpinIfPinned(so->markPos);
495537
BTScanPosInvalidate(so->markPos);
496538

@@ -652,6 +694,217 @@ btrestrpos(IndexScanDesc scan)
652694
}
653695
}
654696

697+
/*
698+
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
699+
*/
700+
Size
701+
btestimateparallelscan(void)
702+
{
703+
return sizeof(BTParallelScanDescData);
704+
}
705+
706+
/*
707+
* btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
708+
*/
709+
void
710+
btinitparallelscan(void *target)
711+
{
712+
BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
713+
714+
SpinLockInit(&bt_target->btps_mutex);
715+
bt_target->btps_scanPage = InvalidBlockNumber;
716+
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
717+
bt_target->btps_arrayKeyCount = 0;
718+
ConditionVariableInit(&bt_target->btps_cv);
719+
}
720+
721+
/*
722+
* btparallelrescan() -- reset parallel scan
723+
*/
724+
void
725+
btparallelrescan(IndexScanDesc scan)
726+
{
727+
BTParallelScanDesc btscan;
728+
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
729+
730+
Assert(parallel_scan);
731+
732+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
733+
parallel_scan->ps_offset);
734+
735+
/*
736+
* In theory, we don't need to acquire the spinlock here, because there
737+
* shouldn't be any other workers running at this point, but we do so for
738+
* consistency.
739+
*/
740+
SpinLockAcquire(&btscan->btps_mutex);
741+
btscan->btps_scanPage = InvalidBlockNumber;
742+
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
743+
btscan->btps_arrayKeyCount = 0;
744+
SpinLockRelease(&btscan->btps_mutex);
745+
}
746+
747+
/*
748+
* _bt_parallel_seize() -- Begin the process of advancing the scan to a new
749+
* page. Other scans must wait until we call _bt_parallel_release() or
749+
* _bt_parallel_done().
751+
*
752+
* The return value is true if we successfully seized the scan and false
753+
* if we did not. The latter case occurs if no pages remain for the current
754+
* set of scankeys.
755+
*
756+
* If the return value is true, *pageno returns the next or current page
757+
* of the scan (depending on the scan direction). An invalid block number
758+
* means the scan hasn't yet started, and P_NONE means we've reached the end.
759+
* The first time a participating process reaches the last page, it will return
760+
* true and set *pageno to P_NONE; after that, further attempts to seize the
761+
* scan will return false.
762+
*
763+
* Callers should ignore the value of pageno if the return value is false.
764+
*/
765+
bool
766+
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
767+
{
768+
BTScanOpaque so = (BTScanOpaque) scan->opaque;
769+
BTPS_State pageStatus;
770+
bool exit_loop = false;
771+
bool status = true;
772+
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
773+
BTParallelScanDesc btscan;
774+
775+
*pageno = P_NONE;
776+
777+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
778+
parallel_scan->ps_offset);
779+
780+
while (1)
781+
{
782+
SpinLockAcquire(&btscan->btps_mutex);
783+
pageStatus = btscan->btps_pageStatus;
784+
785+
if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
786+
{
787+
/* Parallel scan has already advanced to a new set of scankeys. */
788+
status = false;
789+
}
790+
else if (pageStatus == BTPARALLEL_DONE)
791+
{
792+
/*
793+
* We're done with this set of scankeys. This may be the end, or
794+
* there could be more sets to try.
795+
*/
796+
status = false;
797+
}
798+
else if (pageStatus != BTPARALLEL_ADVANCING)
799+
{
800+
/*
801+
* We have successfully seized control of the scan for the purpose
802+
* of advancing it to a new page!
803+
*/
804+
btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
805+
*pageno = btscan->btps_scanPage;
806+
exit_loop = true;
807+
}
808+
SpinLockRelease(&btscan->btps_mutex);
809+
if (exit_loop || !status)
810+
break;
811+
ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
812+
}
813+
ConditionVariableCancelSleep();
814+
815+
return status;
816+
}
817+
818+
/*
819+
* _bt_parallel_release() -- Complete the process of advancing the scan to a
820+
* new page. We now have the new value of btps_scanPage; some other backend
821+
* can now begin advancing the scan.
822+
*/
823+
void
824+
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
825+
{
826+
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
827+
BTParallelScanDesc btscan;
828+
829+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
830+
parallel_scan->ps_offset);
831+
832+
SpinLockAcquire(&btscan->btps_mutex);
833+
btscan->btps_scanPage = scan_page;
834+
btscan->btps_pageStatus = BTPARALLEL_IDLE;
835+
SpinLockRelease(&btscan->btps_mutex);
836+
ConditionVariableSignal(&btscan->btps_cv);
837+
}
838+
839+
/*
840+
* _bt_parallel_done() -- Mark the parallel scan as complete.
841+
*
842+
* When there are no pages left to scan, this function should be called to
843+
* notify other workers. Otherwise, they might wait forever for the scan to
844+
* advance to the next page.
845+
*/
846+
void
847+
_bt_parallel_done(IndexScanDesc scan)
848+
{
849+
BTScanOpaque so = (BTScanOpaque) scan->opaque;
850+
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
851+
BTParallelScanDesc btscan;
852+
bool status_changed = false;
853+
854+
/* Do nothing, for non-parallel scans */
855+
if (parallel_scan == NULL)
856+
return;
857+
858+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
859+
parallel_scan->ps_offset);
860+
861+
/*
862+
* Mark the parallel scan as done for this combination of scan keys,
863+
* unless some other process already did so. See also
864+
* _bt_advance_array_keys.
865+
*/
866+
SpinLockAcquire(&btscan->btps_mutex);
867+
if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
868+
btscan->btps_pageStatus != BTPARALLEL_DONE)
869+
{
870+
btscan->btps_pageStatus = BTPARALLEL_DONE;
871+
status_changed = true;
872+
}
873+
SpinLockRelease(&btscan->btps_mutex);
874+
875+
/* wake up all the workers associated with this parallel scan */
876+
if (status_changed)
877+
ConditionVariableBroadcast(&btscan->btps_cv);
878+
}
879+
880+
/*
881+
* _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
882+
* keys.
883+
*
884+
* Updates the count of array keys processed for both local and parallel
885+
* scans.
886+
*/
887+
void
888+
_bt_parallel_advance_array_keys(IndexScanDesc scan)
889+
{
890+
BTScanOpaque so = (BTScanOpaque) scan->opaque;
891+
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
892+
BTParallelScanDesc btscan;
893+
894+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
895+
parallel_scan->ps_offset);
896+
897+
so->arrayKeyCount++;
898+
SpinLockAcquire(&btscan->btps_mutex);
899+
if (btscan->btps_pageStatus == BTPARALLEL_DONE)
900+
{
901+
btscan->btps_scanPage = InvalidBlockNumber;
902+
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
903+
btscan->btps_arrayKeyCount++;
904+
}
905+
SpinLockRelease(&btscan->btps_mutex);
906+
}
907+
655908
/*
656909
* Bulk deletion of all index entries pointing to a set of heap tuples.
657910
* The set of target tuples is specified via a callback routine that tells

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy