postgrespro
diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml
Lines changed: 30 additions & 37 deletions
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
Lines changed: 121 additions & 40 deletions
@@ -128,18 +128,33 @@ CLUSTER [VERBOSE]
    </para>
 
    <para>
-    During the cluster operation, a temporary copy of the table is created
-    that contains the table data in the index order.  Temporary copies of
-    each index on the table are created as well.  Therefore, you need free
-    space on disk at least equal to the sum of the table size and the index
-    sizes.
+    <command>CLUSTER</> can re-sort the table using either an indexscan
+    on the specified index, or (if the index is a b-tree) a sequential
+    scan followed by sorting.  It will attempt to choose the method that
+    will be faster, based on planner cost parameters and available statistical
+    information.
    </para>
 
    <para>
-    Because <command>CLUSTER</command> remembers the clustering information,
-    one can cluster the tables one wants clustered manually the first time, and
-    setup a timed event similar to <command>VACUUM</command> so that the tables
-    are periodically reclustered.
+    When an indexscan is used, a temporary copy of the table is created that
+    contains the table data in the index order.  Temporary copies of each
+    index on the table are created as well.  Therefore, you need free space on
+    disk at least equal to the sum of the table size and the index sizes.
+   </para>
+
+   <para>
+    When a sequential scan and sort is used, a temporary sort file is
+    also created, so that the peak temporary space requirement is as much
+    as double the table size, plus the index sizes.  This method is often
+    faster than the indexscan method, but if the disk space requirement is
+    intolerable, you can disable this choice by temporarily setting <xref
+    linkend="guc-enable-sort"> to <literal>off</>.
+   </para>
+
+   <para>
+    It is advisable to set <xref linkend="guc-maintenance-work-mem"> to
+    a reasonably large value (but not more than the amount of RAM you can
+    dedicate to the <command>CLUSTER</> operation) before clustering.
    </para>
 
    <para>
@@ -150,35 +165,13 @@ CLUSTER [VERBOSE]
    </para>
 
    <para>
-    There is another way to cluster data. The
-    <command>CLUSTER</command> command reorders the original table by
-    scanning it using the index you specify. This can be slow
-    on large tables because the rows are fetched from the table
-    in index order, and if the table is disordered, the
-    entries are on random pages, so there is one disk page
-    retrieved for every row moved. (<productname>PostgreSQL</productname> has
-    a cache, but the majority of a big table will not fit in the cache.)
-    The other way to cluster a table is to use:
-
-<programlisting>
-CREATE TABLE <replaceable class="parameter">newtable</replaceable> AS
-    SELECT * FROM <replaceable class="parameter">table</replaceable> ORDER BY <replaceable class="parameter">columnlist</replaceable>;
-</programlisting>
-
-    which uses the <productname>PostgreSQL</productname> sorting code
-    to produce the desired order;
-    this is usually much faster than an index scan for disordered data.
-    Then you drop the old table, use
-    <command>ALTER TABLE ... RENAME</command>
-    to rename <replaceable class="parameter">newtable</replaceable> to the
-    old name, and recreate the table's indexes.
-    The big disadvantage of this approach is that it does not preserve
-    OIDs, constraints, foreign key relationships, granted privileges, and
-    other ancillary properties of the table &mdash; all such items must be
-    manually recreated.  Another disadvantage is that this way requires a sort
-    temporary file about the same size as the table itself, so peak disk usage
-    is about three times the table size instead of twice the table size.
+    Because <command>CLUSTER</command> remembers which indexes are clustered,
+    one can manually cluster the desired tables the first time, and then
+    set up a periodic maintenance script that executes
+    <command>CLUSTER</> without any parameters, so that those tables
+    are periodically reclustered.
    </para>
+
  </refsect1>
 
  <refsect1>
 
@@ -36,6 +36,7 @@
 #include "commands/trigger.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
+#include "optimizer/planner.h"
 #include "storage/bufmgr.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
@@ -49,6 +50,7 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/tqual.h"
+#include "utils/tuplesort.h"
 
 
 /*
@@ -69,7 +71,10 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 			   int freeze_min_age, int freeze_table_age,
 			   bool *pSwapToastByContent, TransactionId *pFreezeXid);
 static List *get_tables_to_cluster(MemoryContext cluster_context);
-
+static void reform_and_rewrite_tuple(HeapTuple tuple,
+						 TupleDesc oldTupDesc, TupleDesc newTupDesc,
+						 Datum *values, bool *isnull,
+						 bool newRelHasOids, RewriteState rwstate);
 
 
 /*---------------------------------------------------------------------------
@@ -759,6 +764,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	TransactionId OldestXmin;
 	TransactionId FreezeXid;
 	RewriteState rwstate;
+	bool 		 use_sort;
+	Tuplesortstate *tuplesort;
 
 	/*
 	 * Open the relations we need.
@@ -845,12 +852,30 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
 
 	/*
-	 * Scan through the OldHeap, either in OldIndex order or sequentially, and
-	 * copy each tuple into the NewHeap.  To ensure we see recently-dead
-	 * tuples that still need to be copied, we scan with SnapshotAny and use
+	 * Decide whether to use an indexscan or seqscan-and-optional-sort to
+	 * scan the OldHeap.  We know how to use a sort to duplicate the ordering
+	 * of a btree index, and will use seqscan-and-sort for that case if the
+	 * planner tells us it's cheaper.  Otherwise, always indexscan if an
+	 * index is provided, else plain seqscan.
+	 */
+	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
+		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
+	else
+		use_sort = false;
+
+	/* Set up sorting if wanted */
+	if (use_sort)
+		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
+											maintenance_work_mem, false);
+	else
+		tuplesort = NULL;
+
+	/*
+	 * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
+	 * that still need to be copied, we scan with SnapshotAny and use
 	 * HeapTupleSatisfiesVacuum for the visibility test.
 	 */
-	if (OldIndex != NULL)
+	if (OldIndex != NULL && !use_sort)
 	{
 		heapScan = NULL;
 		indexScan = index_beginscan(OldHeap, OldIndex,
@@ -862,17 +887,21 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 		indexScan = NULL;
 	}
 
+	/*
+	 * Scan through the OldHeap, either in OldIndex order or sequentially;
+	 * copy each tuple into the NewHeap, or transiently to the tuplesort
+	 * module.  Note that we don't bother sorting dead tuples (they won't
+	 * get to the new table anyway).
+	 */
 	for (;;)
 	{
 		HeapTuple	tuple;
-		HeapTuple	copiedTuple;
 		Buffer		buf;
 		bool		isdead;
-		int			i;
 
 		CHECK_FOR_INTERRUPTS();
 
-		if (OldIndex != NULL)
+		if (indexScan != NULL)
 		{
 			tuple = index_getnext(indexScan, ForwardScanDirection);
 			if (tuple == NULL)
@@ -951,45 +980,50 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 			continue;
 		}
 
-		/*
-		 * We cannot simply copy the tuple as-is, for several reasons:
-		 *
-		 * 1. We'd like to squeeze out the values of any dropped columns, both
-		 * to save space and to ensure we have no corner-case failures. (It's
-		 * possible for example that the new table hasn't got a TOAST table
-		 * and so is unable to store any large values of dropped cols.)
-		 *
-		 * 2. The tuple might not even be legal for the new table; this is
-		 * currently only known to happen as an after-effect of ALTER TABLE
-		 * SET WITHOUT OIDS.
-		 *
-		 * So, we must reconstruct the tuple from component Datums.
-		 */
-		heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+		if (tuplesort != NULL)
+			tuplesort_putheaptuple(tuplesort, tuple);
+		else
+			reform_and_rewrite_tuple(tuple,
+									 oldTupDesc, newTupDesc,
+									 values, isnull,
+									 NewHeap->rd_rel->relhasoids, rwstate);
+	}
 
-		/* Be sure to null out any dropped columns */
-		for (i = 0; i < natts; i++)
+	if (indexScan != NULL)
+		index_endscan(indexScan);
+	if (heapScan != NULL)
+		heap_endscan(heapScan);
+
+	/*
+	 * In scan-and-sort mode, complete the sort, then read out all live
+	 * tuples from the tuplesort and write them to the new relation.
+	 */
+	if (tuplesort != NULL)
+	{
+		tuplesort_performsort(tuplesort);
+
+		for (;;)
 		{
-			if (newTupDesc->attrs[i]->attisdropped)
-				isnull[i] = true;
-		}
+			HeapTuple	tuple;
+			bool		shouldfree;
 
-		copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+			CHECK_FOR_INTERRUPTS();
 
-		/* Preserve OID, if any */
-		if (NewHeap->rd_rel->relhasoids)
-			HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
+			tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
+			if (tuple == NULL)
+				break;
 
-		/* The heap rewrite module does the rest */
-		rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+			reform_and_rewrite_tuple(tuple,
+									 oldTupDesc, newTupDesc,
+									 values, isnull,
+									 NewHeap->rd_rel->relhasoids, rwstate);
 
-		heap_freetuple(copiedTuple);
-	}
+			if (shouldfree)
+				heap_freetuple(tuple);
+		}
 
-	if (OldIndex != NULL)
-		index_endscan(indexScan);
-	else
-		heap_endscan(heapScan);
+		tuplesort_end(tuplesort);
+	}
 
 	/* Write out any remaining tuples, and fsync if needed */
 	end_heap_rewrite(rwstate);
@@ -1488,3 +1522,50 @@ get_tables_to_cluster(MemoryContext cluster_context)
 
 	return rvs;
 }
+
+
+/*
+ * Reconstruct and rewrite the given tuple: deform it per oldTupDesc into the
+ * caller-supplied values/isnull arrays, re-form it per newTupDesc, and pass
+ * the original and the new copy to rewrite_heap_tuple(); the copy is freed.
+ *
+ * We cannot simply copy the tuple as-is, but must reconstruct it from its
+ * component Datums, for several reasons:
+ *
+ * 1. We'd like to squeeze out the values of any dropped columns, both
+ * to save space and to ensure we have no corner-case failures. (It's
+ * possible for example that the new table hasn't got a TOAST table
+ * and so is unable to store any large values of dropped cols.)
+ *
+ * 2. The tuple might not even be legal for the new table; this is currently
+ * only known to happen as an after-effect of ALTER TABLE SET WITHOUT OIDS.
+ */
+static void
+reform_and_rewrite_tuple(HeapTuple tuple,
+						 TupleDesc oldTupDesc, TupleDesc newTupDesc,
+						 Datum *values, bool *isnull,
+						 bool newRelHasOids, RewriteState rwstate)
+{
+	HeapTuple	copiedTuple;
+	int 		i;
+
+	heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+
+	/* Be sure to null out any columns marked as dropped in newTupDesc */
+	for (i = 0; i < newTupDesc->natts; i++)
+	{
+		if (newTupDesc->attrs[i]->attisdropped)
+			isnull[i] = true;
+	}
+
+	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+
+	/* Preserve the original tuple's OID when the new relation has OIDs */
+	if (newRelHasOids)
+		HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
+
+	/* The heap rewrite module does the rest */
+	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+
+	heap_freetuple(copiedTuple);
+}