From 518cd4794b9c0369453f8267a841876ea157d7f7 Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com>
Date: Tue, 20 Aug 2024 13:30:59 +0200
Subject: [PATCH] Null validity bitmap in ArrowArray (#7199)

This is allowed when we have no nulls, simplifies the code a little, and
reduces memory usage.
---
 src/adts/bit_array_impl.h                     |  2 +-
 tsl/src/compression/algorithms/array.c        | 35 +++++++--------
 .../compression/algorithms/deltadelta_impl.c  | 37 ++++++++--------
 tsl/src/compression/algorithms/dictionary.c   | 37 ++++++++--------
 tsl/src/compression/algorithms/gorilla_impl.c | 43 ++++++++++---------
 tsl/src/compression/algorithms/simple8b_rle.h |  2 +-
 .../algorithms/simple8b_rle_bitmap.h          |  6 +--
 tsl/src/compression/arrow_c_data_interface.h  | 10 +++++
 .../nodes/decompress_chunk/compressed_batch.c | 11 ++++-
 .../decompress_chunk/vector_predicates.c      |  5 ++-
 .../decompress_chunk/vector_predicates.h      |  2 +-
 tsl/src/nodes/vector_agg/exec.c               |  9 ++--
 tsl/src/nodes/vector_agg/functions.c          | 28 ++++--------
 13 files changed, 117 insertions(+), 110 deletions(-)

diff --git a/src/adts/bit_array_impl.h b/src/adts/bit_array_impl.h
index 5a8005139c8..dbdb1a98c0c 100644
--- a/src/adts/bit_array_impl.h
+++ b/src/adts/bit_array_impl.h
@@ -348,5 +348,5 @@ static uint64
 bit_array_low_bits_mask(uint8 bits_used)
 {
 	Assert(bits_used > 0);
-	return -1ULL >> (64 - bits_used);
+	return ~0ULL >> (64 - bits_used);
 }
diff --git a/tsl/src/compression/algorithms/array.c b/tsl/src/compression/algorithms/array.c
index ef3216274cc..674131e2339 100644
--- a/tsl/src/compression/algorithms/array.c
+++ b/tsl/src/compression/algorithms/array.c
@@ -575,25 +575,26 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls,
 	}
 	offsets[n_notnull] = offset;
 
-	const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64);
-	uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
-
-	/*
-	 * First, mark all data as valid, we will fill the nulls later if needed.
-	 * Note that the validity bitmap size is a multiple of 64 bits. We have to
-	 * fill the tail bits with zeros, because the corresponding elements are not
-	 * valid.
-	 *
-	 */
-	memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
-	if (n_total % 64)
-	{
-		const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
-		validity_bitmap[n_total / 64] &= tail_mask;
-	}
-
+	uint64 *restrict validity_bitmap = NULL;
 	if (has_nulls)
 	{
+		const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64);
+		validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
+
+		/*
+		 * First, mark all data as valid, we will fill the nulls later if needed.
+		 * Note that the validity bitmap size is a multiple of 64 bits. We have to
+		 * fill the tail bits with zeros, because the corresponding elements are not
+		 * valid.
+		 *
+		 */
+		memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
+		if (n_total % 64)
+		{
+			const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
+			validity_bitmap[n_total / 64] &= tail_mask;
+		}
+
 		/*
 		 * We have decompressed the data with nulls skipped, reshuffle it
 		 * according to the nulls bitmap.
diff --git a/tsl/src/compression/algorithms/deltadelta_impl.c b/tsl/src/compression/algorithms/deltadelta_impl.c
index 2673570134e..6e036a5722b 100644
--- a/tsl/src/compression/algorithms/deltadelta_impl.c
+++ b/tsl/src/compression/algorithms/deltadelta_impl.c
@@ -55,9 +55,6 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
 	Assert(n_total >= n_notnull);
 	Assert(n_total <= GLOBAL_MAX_ROWS_PER_COMPRESSION);
 
-	const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
-	uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
-
 	/*
 	 * We need additional padding at the end of buffer, because the code that
 	 * converts the elements to postres Datum always reads in 8 bytes.
@@ -91,23 +88,27 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
 	}
 #undef INNER_LOOP_SIZE
 
-	/*
-	 * First, mark all data as valid, we will fill the nulls later if needed.
-	 * Note that the validity bitmap size is a multiple of 64 bits. We have to
-	 * fill the tail bits with zeros, because the corresponding elements are not
-	 * valid.
-	 *
-	 */
-	memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
-	if (n_total % 64)
-	{
-		const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
-		validity_bitmap[n_total / 64] &= tail_mask;
-	}
-
-	/* Now move the data to account for nulls, and fill the validity bitmap. */
+	uint64 *restrict validity_bitmap = NULL;
 	if (has_nulls)
 	{
+		/* Now move the data to account for nulls, and fill the validity bitmap. */
+		const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
+		validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
+
+		/*
+		 * First, mark all data as valid, we will fill the nulls later if needed.
+		 * Note that the validity bitmap size is a multiple of 64 bits. We have to
+		 * fill the tail bits with zeros, because the corresponding elements are not
+		 * valid.
+		 *
+		 */
+		memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
+		if (n_total % 64)
+		{
+			const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
+			validity_bitmap[n_total / 64] &= tail_mask;
+		}
+
 		/*
 		 * The number of not-null elements we have must be consistent with the
 		 * nulls bitmap.
diff --git a/tsl/src/compression/algorithms/dictionary.c b/tsl/src/compression/algorithms/dictionary.c
index c13bc828b49..a090447c933 100644
--- a/tsl/src/compression/algorithms/dictionary.c
+++ b/tsl/src/compression/algorithms/dictionary.c
@@ -458,26 +458,27 @@ tsl_text_dictionary_decompress_all(Datum compressed, Oid element_type, MemoryCon
 		text_array_decompress_all_serialized_no_header(&si, /* has_nulls = */ false, dest_mctx);
 	CheckCompressedData(header->num_distinct == dict->length);
 
-	/* Fill validity and indices of the array elements, reshuffling for nulls if needed. */
-	const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64;
-	uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
-
-	/*
-	 * First, mark all data as valid, we will fill the nulls later if needed.
-	 * Note that the validity bitmap size is a multiple of 64 bits. We have to
-	 * fill the tail bits with zeros, because the corresponding elements are not
-	 * valid.
-	 *
-	 */
-	memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
-	if (n_total % 64)
-	{
-		const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
-		validity_bitmap[n_total / 64] &= tail_mask;
-	}
-
+	uint64 *restrict validity_bitmap = NULL;
 	if (header->has_nulls)
 	{
+		/* Fill validity and indices of the array elements, reshuffling for nulls if needed. */
+		const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64;
+		validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
+
+		/*
+		 * First, mark all data as valid, we will fill the nulls later if needed.
+		 * Note that the validity bitmap size is a multiple of 64 bits. We have to
+		 * fill the tail bits with zeros, because the corresponding elements are not
+		 * valid.
+		 *
+		 */
+		memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
+		if (n_total % 64)
+		{
+			const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
+			validity_bitmap[n_total / 64] &= tail_mask;
+		}
+
 		/*
 		 * We have decompressed the data with nulls skipped, reshuffle it
 		 * according to the nulls bitmap.
diff --git a/tsl/src/compression/algorithms/gorilla_impl.c b/tsl/src/compression/algorithms/gorilla_impl.c
index 4ea4998a50b..85f6b6f512d 100644
--- a/tsl/src/compression/algorithms/gorilla_impl.c
+++ b/tsl/src/compression/algorithms/gorilla_impl.c
@@ -128,29 +128,30 @@ FUNCTION_NAME(gorilla_decompress_all, ELEMENT_TYPE)(CompressedGorillaData *goril
 		decompressed_values[i] = decompressed_values[simple8brle_bitmap_prefix_sum(&tag0s, i) - 1];
 	}
 
-	/*
-	 * We have unpacked the non-null data. Now reshuffle it to account for nulls,
-	 * and fill the validity bitmap.
-	 */
-	const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
-	uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
-
-	/*
-	 * First, mark all data as valid, we will fill the nulls later if needed.
-	 * Note that the validity bitmap size is a multiple of 64 bits. We have to
-	 * fill the tail bits with zeros, because the corresponding elements are not
-	 * valid.
-	 *
-	 */
-	memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
-	if (n_total % 64)
-	{
-		const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
-		validity_bitmap[n_total / 64] &= tail_mask;
-	}
-
+	uint64 *restrict validity_bitmap = NULL;
 	if (has_nulls)
 	{
+		/*
+		 * We have unpacked the non-null data. Now reshuffle it to account for nulls,
+		 * and fill the validity bitmap.
+		 */
+		const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
+		validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);
+
+		/*
+		 * First, mark all data as valid, we will fill the nulls later if needed.
+		 * Note that the validity bitmap size is a multiple of 64 bits. We have to
+		 * fill the tail bits with zeros, because the corresponding elements are not
+		 * valid.
+		 *
+		 */
+		memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
+		if (n_total % 64)
+		{
+			const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
+			validity_bitmap[n_total / 64] &= tail_mask;
+		}
+
 		/*
 		 * We have decompressed the data with nulls skipped, reshuffle it
 		 * according to the nulls bitmap.
diff --git a/tsl/src/compression/algorithms/simple8b_rle.h b/tsl/src/compression/algorithms/simple8b_rle.h
index ae08cdf4e88..be98d4d3a03 100644
--- a/tsl/src/compression/algorithms/simple8b_rle.h
+++ b/tsl/src/compression/algorithms/simple8b_rle.h
@@ -857,7 +857,7 @@ simple8brle_selector_get_bitmask(uint8 selector)
 {
 	uint8 bitLen = SIMPLE8B_BIT_LENGTH[selector];
 	Assert(bitLen != 0);
-	uint64 result = ((-1ULL) >> (64 - bitLen));
+	uint64 result = ((~0ULL) >> (64 - bitLen));
 	return result;
 }
 
diff --git a/tsl/src/compression/algorithms/simple8b_rle_bitmap.h b/tsl/src/compression/algorithms/simple8b_rle_bitmap.h
index 11f245ee781..9237a9151e8 100644
--- a/tsl/src/compression/algorithms/simple8b_rle_bitmap.h
+++ b/tsl/src/compression/algorithms/simple8b_rle_bitmap.h
@@ -148,7 +148,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed)
 			const int elements_this_block = Min(64, num_elements - decompressed_index);
 			Assert(elements_this_block <= 64);
 			Assert(elements_this_block > 0);
-			block_data &= (-1ULL) >> (64 - elements_this_block);
+			block_data &= (~0ULL) >> (64 - elements_this_block);
 
 			/*
 			 * The number of block elements should fit within padding. Previous
@@ -161,7 +161,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed)
 			for (uint16 i = 0; i < 64; i++)
 			{
 				const uint16 word_prefix_sum =
-					__builtin_popcountll(block_data & (-1ULL >> (63 - i)));
+					__builtin_popcountll(block_data & ((~0ULL) >> (63 - i)));
 				prefix_sums[decompressed_index + i] = num_ones + word_prefix_sum;
 			}
 			num_ones += __builtin_popcountll(block_data);
@@ -304,7 +304,7 @@ simple8brle_bitmap_decompress(Simple8bRleSerialized *compressed)
 			const int elements_this_block = Min(64, num_elements - decompressed_index);
 			Assert(elements_this_block <= 64);
 			Assert(elements_this_block > 0);
-			block_data &= (-1ULL) >> (64 - elements_this_block);
+			block_data &= (~0ULL) >> (64 - elements_this_block);
 
 			/*
 			 * The number of block elements should fit within padding. Previous
diff --git a/tsl/src/compression/arrow_c_data_interface.h b/tsl/src/compression/arrow_c_data_interface.h
index 5a34870dff0..436fed48c82 100644
--- a/tsl/src/compression/arrow_c_data_interface.h
+++ b/tsl/src/compression/arrow_c_data_interface.h
@@ -136,6 +136,11 @@ struct ArrowSchema
 static pg_attribute_always_inline bool
 arrow_row_is_valid(const uint64 *bitmap, size_t row_number)
 {
+	if (bitmap == NULL)
+	{
+		return true;
+	}
+
 	const size_t qword_index = row_number / 64;
 	const size_t bit_index = row_number % 64;
 	const uint64 mask = 1ull << bit_index;
@@ -164,6 +169,11 @@ pad_to_multiple(uint64 pad_to, uint64 source_value)
 static inline size_t
 arrow_num_valid(uint64 *bitmap, size_t total_rows)
 {
+	if (bitmap == NULL)
+	{
+		return total_rows;
+	}
+
 	uint64 num_valid = 0;
 #ifdef HAVE__BUILTIN_POPCOUNT
 	const uint64 words = pad_to_multiple(64, total_rows) / 64;
diff --git a/tsl/src/nodes/decompress_chunk/compressed_batch.c b/tsl/src/nodes/decompress_chunk/compressed_batch.c
index bdaab5d607f..0dc9b9b9f67 100644
--- a/tsl/src/nodes/decompress_chunk/compressed_batch.c
+++ b/tsl/src/nodes/decompress_chunk/compressed_batch.c
@@ -566,9 +566,16 @@ compute_plain_qual(DecompressContext *dcontext, DecompressBatchState *batch_stat
 		Assert((predicate_result != default_value_predicate_result) ||
 			   n_vector_result_words == 1); /* to placate Coverity. */
 		const uint64 *validity = (const uint64 *) vector->buffers[0];
-		for (size_t i = 0; i < n_vector_result_words; i++)
+		if (validity)
 		{
-			predicate_result[i] &= validity[i];
+			for (size_t i = 0; i < n_vector_result_words; i++)
+			{
+				predicate_result[i] &= validity[i];
+			}
+		}
+		else
+		{
+			Assert(vector->null_count == 0);
 		}
 	}
 
diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.c b/tsl/src/nodes/decompress_chunk/vector_predicates.c
index 46fd148001b..f1be155c3b1 100644
--- a/tsl/src/nodes/decompress_chunk/vector_predicates.c
+++ b/tsl/src/nodes/decompress_chunk/vector_predicates.c
@@ -74,13 +74,14 @@ vector_nulltest(const ArrowArray *arrow, int test_type, uint64 *restrict result)
 	const uint64 *validity = (const uint64 *) arrow->buffers[0];
 	for (uint16 i = 0; i < bitmap_words; i++)
 	{
+		const uint64 validity_word = validity != NULL ? validity[i] : ~0ULL;
 		if (should_be_null)
 		{
-			result[i] &= ~validity[i];
+			result[i] &= ~validity_word;
 		}
 		else
 		{
-			result[i] &= validity[i];
+			result[i] &= validity_word;
 		}
 	}
 }
diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.h b/tsl/src/nodes/decompress_chunk/vector_predicates.h
index 141563149dd..0fd3fa6f005 100644
--- a/tsl/src/nodes/decompress_chunk/vector_predicates.h
+++ b/tsl/src/nodes/decompress_chunk/vector_predicates.h
@@ -38,7 +38,7 @@ get_vector_qual_summary(uint64 *restrict qual_result, size_t n_rows)
 
 	if (n_rows % 64 != 0)
 	{
-		const uint64 last_word_mask = -1ULL >> (64 - n_rows % 64);
+		const uint64 last_word_mask = ~0ULL >> (64 - n_rows % 64);
 		any_rows_pass |= (qual_result[n_rows / 64] & last_word_mask) != 0;
 		all_rows_pass &= ((~qual_result[n_rows / 64]) & last_word_mask) == 0;
 	}
diff --git a/tsl/src/nodes/vector_agg/exec.c b/tsl/src/nodes/vector_agg/exec.c
index cc51b805e34..6f37218f33a 100644
--- a/tsl/src/nodes/vector_agg/exec.c
+++ b/tsl/src/nodes/vector_agg/exec.c
@@ -149,12 +149,9 @@ vector_agg_exec(CustomScanState *vector_agg_state)
 		 * column value, we need to multiply this value with the number of
 		 * passing decompressed tuples in this batch.
 		 */
-		int n = batch_state->total_batch_rows;
-		if (batch_state->vector_qual_result)
-		{
-			n = arrow_num_valid(batch_state->vector_qual_result, n);
-			Assert(n > 0);
-		}
+		const int n =
+			arrow_num_valid(batch_state->vector_qual_result, batch_state->total_batch_rows);
+		Assert(n > 0);
 
 		int offs = AttrNumberGetAttrOffset(value_column_description->custom_scan_attno);
 		agg->agg_const(batch_state->decompressed_scan_slot_data.base.tts_values[offs],
diff --git a/tsl/src/nodes/vector_agg/functions.c b/tsl/src/nodes/vector_agg/functions.c
index fa9a185c25b..60d82885e6a 100644
--- a/tsl/src/nodes/vector_agg/functions.c
+++ b/tsl/src/nodes/vector_agg/functions.c
@@ -44,30 +44,18 @@ int4_sum_vector(ArrowArray *vector, uint64 *filter, Datum *agg_value, bool *agg_
 	 */
 	Assert(vector->length <= INT_MAX);
 
-	int64 batch_sum = 0;
-
 	/*
-	 * This loop is not unrolled automatically, so do it manually as usual.
-	 * The value buffer is padded to an even multiple of 64 bytes, i.e. to
-	 * 64 / 4 = 16 elements. The bitmap is an even multiple of 64 elements.
-	 * The number of elements in the inner loop must be less than both these
-	 * values so that we don't go out of bounds. The particular value was
-	 * chosen because it gives some speedup, and the larger values blow up
-	 * the generated code with no performance benefit (checked on clang 16).
+	 * Note that we use a simplest loop here, there are many possibilities of
+	 * optimizing this function (for example, this loop is not unrolled by
+	 * clang-16).
 	 */
-#define INNER_LOOP_SIZE 4
-	const int outer_boundary = pad_to_multiple(INNER_LOOP_SIZE, vector->length);
-	for (int outer = 0; outer < outer_boundary; outer += INNER_LOOP_SIZE)
+	int64 batch_sum = 0;
+	for (int row = 0; row < vector->length; row++)
 	{
-		for (int inner = 0; inner < INNER_LOOP_SIZE; inner++)
-		{
-			const int row = outer + inner;
-			const int32 arrow_value = ((int32 *) vector->buffers[1])[row];
-			const bool passes_filter = filter ? arrow_row_is_valid(filter, row) : true;
-			batch_sum += passes_filter * arrow_value * arrow_row_is_valid(vector->buffers[0], row);
-		}
+		const int32 arrow_value = ((int32 *) vector->buffers[1])[row];
+		batch_sum += arrow_value * arrow_row_is_valid(filter, row) *
+					 arrow_row_is_valid(vector->buffers[0], row);
 	}
-#undef INNER_LOOP_SIZE
 
 	int64 tmp = DatumGetInt64(*agg_value);
 	if (unlikely(pg_add_s64_overflow(tmp, batch_sum, &tmp)))