Skip to content

Commit 5dc692f

Browse files
committed
Ensure proper alignment of tuples in HashMemoryChunkData buffers.
The previous coding relied (without any documentation) on the data[] member of HashMemoryChunkData being at a MAXALIGN'ed offset. If it was not, the tuples would not be maxaligned either, leading to failures on alignment-picky machines. While there seems to be no live bug on any platform we support, this is clearly pretty fragile: any addition to or rearrangement of the fields in HashMemoryChunkData could break it. Let's remove the hazard by getting rid of the data[] member and instead using pointer arithmetic with an explicitly maxalign'ed offset. Discussion: https://postgr.es/m/14483.1514938129@sss.pgh.pa.us
1 parent 54eff53 commit 5dc692f

File tree

2 files changed

+24
-22
lines changed

2 files changed

+24
-22
lines changed

src/backend/executor/nodeHash.c

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -979,7 +979,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
979979
/* process all tuples stored in this chunk (and then free it) */
980980
while (idx < oldchunks->used)
981981
{
982-
HashJoinTuple hashTuple = (HashJoinTuple) (oldchunks->data + idx);
982+
HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(oldchunks) + idx);
983983
MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
984984
int hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
985985
int bucketno;
@@ -1285,7 +1285,7 @@ ExecParallelHashRepartitionFirst(HashJoinTable hashtable)
12851285
/* Repartition all tuples in this chunk. */
12861286
while (idx < chunk->used)
12871287
{
1288-
HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
1288+
HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
12891289
MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
12901290
HashJoinTuple copyTuple;
12911291
dsa_pointer shared;
@@ -1469,7 +1469,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
14691469

14701470
while (idx < chunk->used)
14711471
{
1472-
HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
1472+
HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
14731473
int bucketno;
14741474
int batchno;
14751475

@@ -1552,7 +1552,7 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable)
15521552

15531553
while (idx < chunk->used)
15541554
{
1555-
HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
1555+
HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
15561556
dsa_pointer shared = chunk_s + HASH_CHUNK_HEADER_SIZE + idx;
15571557
int bucketno;
15581558
int batchno;
@@ -2651,17 +2651,16 @@ dense_alloc(HashJoinTable hashtable, Size size)
26512651
size = MAXALIGN(size);
26522652

26532653
/*
2654-
* If tuple size is larger than of 1/4 of chunk size, allocate a separate
2655-
* chunk.
2654+
* If tuple size is larger than threshold, allocate a separate chunk.
26562655
*/
26572656
if (size > HASH_CHUNK_THRESHOLD)
26582657
{
26592658
/* allocate new chunk and put it at the beginning of the list */
26602659
newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
2661-
offsetof(HashMemoryChunkData, data) + size);
2660+
HASH_CHUNK_HEADER_SIZE + size);
26622661
newChunk->maxlen = size;
2663-
newChunk->used = 0;
2664-
newChunk->ntuples = 0;
2662+
newChunk->used = size;
2663+
newChunk->ntuples = 1;
26652664

26662665
/*
26672666
* Add this chunk to the list after the first existing chunk, so that
@@ -2678,10 +2677,7 @@ dense_alloc(HashJoinTable hashtable, Size size)
26782677
hashtable->chunks = newChunk;
26792678
}
26802679

2681-
newChunk->used += size;
2682-
newChunk->ntuples += 1;
2683-
2684-
return newChunk->data;
2680+
return HASH_CHUNK_DATA(newChunk);
26852681
}
26862682

26872683
/*
@@ -2693,7 +2689,7 @@ dense_alloc(HashJoinTable hashtable, Size size)
26932689
{
26942690
/* allocate new chunk and put it at the beginning of the list */
26952691
newChunk = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
2696-
offsetof(HashMemoryChunkData, data) + HASH_CHUNK_SIZE);
2692+
HASH_CHUNK_HEADER_SIZE + HASH_CHUNK_SIZE);
26972693

26982694
newChunk->maxlen = HASH_CHUNK_SIZE;
26992695
newChunk->used = size;
@@ -2702,11 +2698,11 @@ dense_alloc(HashJoinTable hashtable, Size size)
27022698
newChunk->next.unshared = hashtable->chunks;
27032699
hashtable->chunks = newChunk;
27042700

2705-
return newChunk->data;
2701+
return HASH_CHUNK_DATA(newChunk);
27062702
}
27072703

27082704
/* There is enough space in the current chunk, let's add the tuple */
2709-
ptr = hashtable->chunks->data + hashtable->chunks->used;
2705+
ptr = HASH_CHUNK_DATA(hashtable->chunks) + hashtable->chunks->used;
27102706
hashtable->chunks->used += size;
27112707
hashtable->chunks->ntuples += 1;
27122708

@@ -2751,7 +2747,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
27512747
chunk_shared = hashtable->current_chunk_shared;
27522748
Assert(chunk == dsa_get_address(hashtable->area, chunk_shared));
27532749
*shared = chunk_shared + HASH_CHUNK_HEADER_SIZE + chunk->used;
2754-
result = (HashJoinTuple) (chunk->data + chunk->used);
2750+
result = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + chunk->used);
27552751
chunk->used += size;
27562752

27572753
Assert(chunk->used <= chunk->maxlen);
@@ -2859,8 +2855,8 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
28592855
}
28602856
LWLockRelease(&pstate->lock);
28612857

2862-
Assert(chunk->data == dsa_get_address(hashtable->area, *shared));
2863-
result = (HashJoinTuple) chunk->data;
2858+
Assert(HASH_CHUNK_DATA(chunk) == dsa_get_address(hashtable->area, *shared));
2859+
result = (HashJoinTuple) HASH_CHUNK_DATA(chunk);
28642860

28652861
return result;
28662862
}

src/include/executor/hashjoin.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ typedef struct HashSkewBucket
117117
typedef struct HashMemoryChunkData
118118
{
119119
int ntuples; /* number of tuples stored in this chunk */
120-
size_t maxlen; /* size of the buffer holding the tuples */
120+
size_t maxlen; /* size of the chunk's tuple buffer */
121121
size_t used; /* number of buffer bytes already used */
122122

123123
/* pointer to the next chunk (linked list) */
@@ -127,13 +127,19 @@ typedef struct HashMemoryChunkData
127127
dsa_pointer shared;
128128
} next;
129129

130-
char data[FLEXIBLE_ARRAY_MEMBER]; /* buffer allocated at the end */
130+
/*
131+
* The chunk's tuple buffer starts after the HashMemoryChunkData struct,
132+
* at offset HASH_CHUNK_HEADER_SIZE (which must be maxaligned). Note that
133+
* that offset is not included in "maxlen" or "used".
134+
*/
131135
} HashMemoryChunkData;
132136

133137
typedef struct HashMemoryChunkData *HashMemoryChunk;
134138

135139
#define HASH_CHUNK_SIZE (32 * 1024L)
136-
#define HASH_CHUNK_HEADER_SIZE (offsetof(HashMemoryChunkData, data))
140+
#define HASH_CHUNK_HEADER_SIZE MAXALIGN(sizeof(HashMemoryChunkData))
141+
#define HASH_CHUNK_DATA(hc) (((char *) (hc)) + HASH_CHUNK_HEADER_SIZE)
142+
/* tuples exceeding HASH_CHUNK_THRESHOLD bytes are put in their own chunk */
137143
#define HASH_CHUNK_THRESHOLD (HASH_CHUNK_SIZE / 4)
138144

139145
/*

0 commit comments

Comments (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy