Skip to content

Commit b4af1c2

Browse files
committed
Fix SPGiST vacuum algorithm to handle concurrent tuple motion properly.
A leaf tuple that we need to delete could get moved as a consequence of an insertion happening concurrently with the VACUUM scan. If it moves from a page past the current scan point to a page before, we'll miss it, which is not acceptable. Hence, when we see a leaf-page REDIRECT that could have been made since our scan started, chase down the redirection pointer much as if we were doing a normal index search, and be sure to vacuum every page it leads to. This fixes the issue because, if the tuple was on page N at the instant we start our scan, we will surely find it as a consequence of chasing the redirect from page N, no matter how much it moves around in between. Problem noted by Takashi Yamamoto.
1 parent bad250f commit b4af1c2

File tree

2 files changed

+242
-10
lines changed

2 files changed

+242
-10
lines changed

src/backend/access/spgist/README

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,27 @@ the reverse map of the nextOffset links (ie, when we see tuple x links to
314314
tuple y, we set predecessor[y] = x). Then head tuples are the ones with
315315
no predecessor.
316316

317+
Because insertions can occur while VACUUM runs, a pure sequential scan
318+
could miss deleting some target leaf tuples, because they could get moved
319+
from a not-yet-visited leaf page to an already-visited leaf page as a
320+
consequence of a PickSplit or MoveLeafs operation. Failing to delete any
321+
target TID is not acceptable, so we have to extend the algorithm to cope
322+
with such cases. We recognize that such a move might have occurred when
323+
we see a leaf-page REDIRECT tuple whose XID indicates it might have been
324+
created after the VACUUM scan started. We add the redirection target TID
325+
to a "pending list" of places we need to recheck. Between pages of the
326+
main sequential scan, we empty the pending list by visiting each listed
327+
TID. If it points to an inner tuple (from a PickSplit), add each downlink
328+
TID to the pending list. If it points to a leaf page, vacuum that page.
329+
(We could just vacuum the single pointed-to chain, but vacuuming the
330+
whole page simplifies the code and reduces the odds of VACUUM having to
331+
modify the same page multiple times.) To ensure that pending-list
332+
processing can never get into an endless loop, even in the face of
333+
concurrent index changes, we don't remove list entries immediately but
334+
only after we've completed all pending-list processing; instead we just
335+
mark items as done after processing them. Adding a TID that's already in
336+
the list is a no-op, whether or not that item is marked done yet.
337+
317338
spgbulkdelete also updates the index's free space map.
318339

319340
Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was

src/backend/access/spgist/spgvacuum.c

Lines changed: 221 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,32 +25,106 @@
2525
#include "storage/indexfsm.h"
2626
#include "storage/lmgr.h"
2727
#include "storage/procarray.h"
28+
#include "utils/snapmgr.h"
2829

2930

30-
/* local state for vacuum operations */
31+
/* Entry in pending-list of TIDs we need to revisit */
32+
typedef struct spgVacPendingItem
33+
{
34+
ItemPointerData tid; /* redirection target to visit */
35+
bool done; /* have we dealt with this? */
36+
struct spgVacPendingItem *next; /* list link */
37+
} spgVacPendingItem;
38+
39+
/* Local state for vacuum operations */
3140
typedef struct spgBulkDeleteState
3241
{
3342
/* Parameters passed in to spgvacuumscan */
3443
IndexVacuumInfo *info;
3544
IndexBulkDeleteResult *stats;
3645
IndexBulkDeleteCallback callback;
3746
void *callback_state;
47+
3848
/* Additional working state */
39-
SpGistState spgstate;
40-
TransactionId OldestXmin;
41-
BlockNumber lastFilledBlock;
49+
SpGistState spgstate; /* for SPGiST operations that need one */
50+
spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
51+
TransactionId myXmin; /* for detecting newly-added redirects */
52+
TransactionId OldestXmin; /* for deciding a redirect is obsolete */
53+
BlockNumber lastFilledBlock; /* last non-deletable block */
4254
} spgBulkDeleteState;
4355

4456

57+
/*
58+
* Add TID to pendingList, but only if not already present.
59+
*
60+
* Note that new items are always appended at the end of the list; this
61+
* ensures that scans of the list don't miss items added during the scan.
62+
*/
63+
static void
64+
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
65+
{
66+
spgVacPendingItem *pitem;
67+
spgVacPendingItem **listLink;
68+
69+
/* search the list for pre-existing entry */
70+
listLink = &bds->pendingList;
71+
while (*listLink != NULL)
72+
{
73+
pitem = *listLink;
74+
if (ItemPointerEquals(tid, &pitem->tid))
75+
return; /* already in list, do nothing */
76+
listLink = &pitem->next;
77+
}
78+
/* not there, so append new entry */
79+
pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
80+
pitem->tid = *tid;
81+
pitem->done = false;
82+
pitem->next = NULL;
83+
*listLink = pitem;
84+
}
85+
86+
/*
87+
* Clear pendingList
88+
*/
89+
static void
90+
spgClearPendingList(spgBulkDeleteState *bds)
91+
{
92+
spgVacPendingItem *pitem;
93+
spgVacPendingItem *nitem;
94+
95+
for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
96+
{
97+
nitem = pitem->next;
98+
/* All items in list should have been dealt with */
99+
Assert(pitem->done);
100+
pfree(pitem);
101+
}
102+
bds->pendingList = NULL;
103+
}
104+
45105
/*
46106
* Vacuum a regular (non-root) leaf page
47107
*
48108
* We must delete tuples that are targeted for deletion by the VACUUM,
49109
* but not move any tuples that are referenced by outside links; we assume
50110
* those are the ones that are heads of chains.
111+
*
112+
* If we find a REDIRECT that was made by a concurrently-running transaction,
113+
* we must add its target TID to pendingList. (We don't try to visit the
114+
* target immediately, first because we don't want VACUUM locking more than
115+
* one buffer at a time, and second because the duplicate-filtering logic
116+
* in spgAddPendingTID is useful to ensure we can't get caught in an infinite
117+
* loop in the face of continuous concurrent insertions.)
118+
*
119+
* If forPending is true, we are examining the page as a consequence of
120+
* chasing a redirect link, not as part of the normal sequential scan.
121+
* We still vacuum the page normally, but we don't increment the stats
122+
* about live tuples; else we'd double-count those tuples, since the page
123+
* has been or will be visited in the sequential scan as well.
51124
*/
52125
static void
53-
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
126+
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
127+
bool forPending)
54128
{
55129
Page page = BufferGetPage(buffer);
56130
spgxlogVacuumLeaf xlrec;
@@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
90164
}
91165
else
92166
{
93-
bds->stats->num_index_tuples += 1;
167+
if (!forPending)
168+
bds->stats->num_index_tuples += 1;
94169
}
95170

96171
/* Form predecessor map, too */
@@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
106181
predecessor[lt->nextOffset] = i;
107182
}
108183
}
184+
else if (lt->tupstate == SPGIST_REDIRECT)
185+
{
186+
SpGistDeadTuple dt = (SpGistDeadTuple) lt;
187+
188+
Assert(dt->nextOffset == InvalidOffsetNumber);
189+
Assert(ItemPointerIsValid(&dt->pointer));
190+
191+
/*
192+
* Add target TID to pending list if the redirection could have
193+
* happened since VACUUM started.
194+
*
195+
* Note: we could make a tighter test by seeing if the xid is
196+
* "running" according to the active snapshot; but tqual.c doesn't
197+
* currently export a suitable API, and it's not entirely clear
198+
* that a tighter test is worth the cycles anyway.
199+
*/
200+
if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
201+
spgAddPendingTID(bds, &dt->pointer);
202+
}
109203
else
110204
{
111205
Assert(lt->nextOffset == InvalidOffsetNumber);
@@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
545639
}
546640
else
547641
{
548-
vacuumLeafPage(bds, index, buffer);
642+
vacuumLeafPage(bds, index, buffer, false);
549643
vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
550644
}
551645
}
@@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
556650
}
557651

558652
/*
559-
* The root page must never be deleted, nor marked as available in FSM,
560-
* because we don't want it ever returned by a search for a place to
653+
* The root pages must never be deleted, nor marked as available in FSM,
654+
* because we don't want them ever returned by a search for a place to
561655
* put a new tuple. Otherwise, check for empty/deletable page, and
562656
* make sure FSM knows about it.
563657
*/
@@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
585679
UnlockReleaseBuffer(buffer);
586680
}
587681

682+
/*
683+
* Process the pending-TID list between pages of the main scan
684+
*/
685+
static void
686+
spgprocesspending(spgBulkDeleteState *bds)
687+
{
688+
Relation index = bds->info->index;
689+
spgVacPendingItem *pitem;
690+
spgVacPendingItem *nitem;
691+
BlockNumber blkno;
692+
Buffer buffer;
693+
Page page;
694+
695+
for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
696+
{
697+
if (pitem->done)
698+
continue; /* ignore already-done items */
699+
700+
/* call vacuum_delay_point while not holding any buffer lock */
701+
vacuum_delay_point();
702+
703+
/* examine the referenced page */
704+
blkno = ItemPointerGetBlockNumber(&pitem->tid);
705+
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
706+
RBM_NORMAL, bds->info->strategy);
707+
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
708+
page = (Page) BufferGetPage(buffer);
709+
710+
if (PageIsNew(page) || SpGistPageIsDeleted(page))
711+
{
712+
/* Probably shouldn't happen, but ignore it */
713+
}
714+
else if (SpGistPageIsLeaf(page))
715+
{
716+
if (SpGistBlockIsRoot(blkno))
717+
{
718+
/* this should definitely not happen */
719+
elog(ERROR, "redirection leads to root page of index \"%s\"",
720+
RelationGetRelationName(index));
721+
}
722+
723+
/* deal with any deletable tuples */
724+
vacuumLeafPage(bds, index, buffer, true);
725+
/* might as well do this while we are here */
726+
vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
727+
728+
SpGistSetLastUsedPage(index, buffer);
729+
730+
/*
731+
* We can mark as done not only this item, but any later ones
732+
* pointing at the same page, since we vacuumed the whole page.
733+
*/
734+
pitem->done = true;
735+
for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
736+
{
737+
if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
738+
nitem->done = true;
739+
}
740+
}
741+
else
742+
{
743+
/*
744+
* On an inner page, visit the referenced inner tuple and add
745+
* all its downlinks to the pending list. We might have pending
746+
* items for more than one inner tuple on the same page (in fact
747+
* this is pretty likely given the way space allocation works),
748+
* so get them all while we are here.
749+
*/
750+
for (nitem = pitem; nitem != NULL; nitem = nitem->next)
751+
{
752+
if (nitem->done)
753+
continue;
754+
if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
755+
{
756+
OffsetNumber offset;
757+
SpGistInnerTuple innerTuple;
758+
759+
offset = ItemPointerGetOffsetNumber(&nitem->tid);
760+
innerTuple = (SpGistInnerTuple) PageGetItem(page,
761+
PageGetItemId(page, offset));
762+
if (innerTuple->tupstate == SPGIST_LIVE)
763+
{
764+
SpGistNodeTuple node;
765+
int i;
766+
767+
SGITITERATE(innerTuple, i, node)
768+
{
769+
if (ItemPointerIsValid(&node->t_tid))
770+
spgAddPendingTID(bds, &node->t_tid);
771+
}
772+
}
773+
else if (innerTuple->tupstate == SPGIST_REDIRECT)
774+
{
775+
/* transfer attention to redirect point */
776+
spgAddPendingTID(bds,
777+
&((SpGistDeadTuple) innerTuple)->pointer);
778+
}
779+
else
780+
elog(ERROR, "unexpected SPGiST tuple state: %d",
781+
innerTuple->tupstate);
782+
783+
nitem->done = true;
784+
}
785+
}
786+
}
787+
788+
UnlockReleaseBuffer(buffer);
789+
}
790+
791+
spgClearPendingList(bds);
792+
}
793+
588794
/*
589795
* Perform a bulkdelete scan
590796
*/
@@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds)
598804

599805
/* Finish setting up spgBulkDeleteState */
600806
initSpGistState(&bds->spgstate, index);
807+
bds->pendingList = NULL;
808+
bds->myXmin = GetActiveSnapshot()->xmin;
601809
bds->OldestXmin = GetOldestXmin(true, false);
602810
bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
603811

@@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds)
637845
for (; blkno < num_pages; blkno++)
638846
{
639847
spgvacuumpage(bds, blkno);
848+
/* empty the pending-list after each page */
849+
if (bds->pendingList != NULL)
850+
spgprocesspending(bds);
640851
}
641852
}
642853

@@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS)
747958
IndexFreeSpaceMapVacuum(index);
748959

749960
/*
750-
* It's quite possible for us to be fooled by concurrent page splits into
961+
* It's quite possible for us to be fooled by concurrent tuple moves into
751962
* double-counting some index tuples, so disbelieve any total that exceeds
752963
* the underlying heap's count ... if we know that accurately. Otherwise
753964
* this might just make matters worse.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy