From 518cd4794b9c0369453f8267a841876ea157d7f7 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:30:59 +0200 Subject: [PATCH] Null validity bitmap in ArrowArray (#7199) This is allowed when we have no nulls, simplifies the code a little, and reduces memory usage. --- src/adts/bit_array_impl.h | 2 +- tsl/src/compression/algorithms/array.c | 35 +++++++-------- .../compression/algorithms/deltadelta_impl.c | 37 ++++++++-------- tsl/src/compression/algorithms/dictionary.c | 37 ++++++++-------- tsl/src/compression/algorithms/gorilla_impl.c | 43 ++++++++++--------- tsl/src/compression/algorithms/simple8b_rle.h | 2 +- .../algorithms/simple8b_rle_bitmap.h | 6 +-- tsl/src/compression/arrow_c_data_interface.h | 10 +++++ .../nodes/decompress_chunk/compressed_batch.c | 11 ++++- .../decompress_chunk/vector_predicates.c | 5 ++- .../decompress_chunk/vector_predicates.h | 2 +- tsl/src/nodes/vector_agg/exec.c | 9 ++-- tsl/src/nodes/vector_agg/functions.c | 28 ++++-------- 13 files changed, 117 insertions(+), 110 deletions(-) diff --git a/src/adts/bit_array_impl.h b/src/adts/bit_array_impl.h index 5a8005139c8..dbdb1a98c0c 100644 --- a/src/adts/bit_array_impl.h +++ b/src/adts/bit_array_impl.h @@ -348,5 +348,5 @@ static uint64 bit_array_low_bits_mask(uint8 bits_used) { Assert(bits_used > 0); - return -1ULL >> (64 - bits_used); + return ~0ULL >> (64 - bits_used); } diff --git a/tsl/src/compression/algorithms/array.c b/tsl/src/compression/algorithms/array.c index ef3216274cc..674131e2339 100644 --- a/tsl/src/compression/algorithms/array.c +++ b/tsl/src/compression/algorithms/array.c @@ -575,25 +575,26 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls, } offsets[n_notnull] = offset; - const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64); - uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); - - /* - * First, mark all data as valid, we will fill the nulls later if needed. - * Note that the validity bitmap size is a multiple of 64 bits. We have to - * fill the tail bits with zeros, because the corresponding elements are not - * valid. - * - */ - memset(validity_bitmap, 0xFF, validity_bitmap_bytes); - if (n_total % 64) - { - const uint64 tail_mask = -1ULL >> (64 - n_total % 64); - validity_bitmap[n_total / 64] &= tail_mask; - } - + uint64 *restrict validity_bitmap = NULL; if (has_nulls) { + const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64); + validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); + + /* + * First, mark all data as valid, we will fill the nulls later if needed. + * Note that the validity bitmap size is a multiple of 64 bits. We have to + * fill the tail bits with zeros, because the corresponding elements are not + * valid. + * + */ + memset(validity_bitmap, 0xFF, validity_bitmap_bytes); + if (n_total % 64) + { + const uint64 tail_mask = ~0ULL >> (64 - n_total % 64); + validity_bitmap[n_total / 64] &= tail_mask; + } + /* * We have decompressed the data with nulls skipped, reshuffle it * according to the nulls bitmap. diff --git a/tsl/src/compression/algorithms/deltadelta_impl.c b/tsl/src/compression/algorithms/deltadelta_impl.c index 2673570134e..6e036a5722b 100644 --- a/tsl/src/compression/algorithms/deltadelta_impl.c +++ b/tsl/src/compression/algorithms/deltadelta_impl.c @@ -55,9 +55,6 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory Assert(n_total >= n_notnull); Assert(n_total <= GLOBAL_MAX_ROWS_PER_COMPRESSION); - const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64); - uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); - /* * We need additional padding at the end of buffer, because the code that * converts the elements to postres Datum always reads in 8 bytes. @@ -91,23 +88,27 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory } #undef INNER_LOOP_SIZE - /* - * First, mark all data as valid, we will fill the nulls later if needed. - * Note that the validity bitmap size is a multiple of 64 bits. We have to - * fill the tail bits with zeros, because the corresponding elements are not - * valid. - * - */ - memset(validity_bitmap, 0xFF, validity_bitmap_bytes); - if (n_total % 64) - { - const uint64 tail_mask = -1ULL >> (64 - n_total % 64); - validity_bitmap[n_total / 64] &= tail_mask; - } - - /* Now move the data to account for nulls, and fill the validity bitmap. */ + uint64 *restrict validity_bitmap = NULL; if (has_nulls) { + /* Now move the data to account for nulls, and fill the validity bitmap. */ + const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64); + validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); + + /* + * First, mark all data as valid, we will fill the nulls later if needed. + * Note that the validity bitmap size is a multiple of 64 bits. We have to + * fill the tail bits with zeros, because the corresponding elements are not + * valid. + * + */ + memset(validity_bitmap, 0xFF, validity_bitmap_bytes); + if (n_total % 64) + { + const uint64 tail_mask = ~0ULL >> (64 - n_total % 64); + validity_bitmap[n_total / 64] &= tail_mask; + } + /* * The number of not-null elements we have must be consistent with the * nulls bitmap. diff --git a/tsl/src/compression/algorithms/dictionary.c b/tsl/src/compression/algorithms/dictionary.c index c13bc828b49..a090447c933 100644 --- a/tsl/src/compression/algorithms/dictionary.c +++ b/tsl/src/compression/algorithms/dictionary.c @@ -458,26 +458,27 @@ tsl_text_dictionary_decompress_all(Datum compressed, Oid element_type, MemoryCon text_array_decompress_all_serialized_no_header(&si, /* has_nulls = */ false, dest_mctx); CheckCompressedData(header->num_distinct == dict->length); - /* Fill validity and indices of the array elements, reshuffling for nulls if needed. */ - const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64; - uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); - - /* - * First, mark all data as valid, we will fill the nulls later if needed. - * Note that the validity bitmap size is a multiple of 64 bits. We have to - * fill the tail bits with zeros, because the corresponding elements are not - * valid. - * - */ - memset(validity_bitmap, 0xFF, validity_bitmap_bytes); - if (n_total % 64) - { - const uint64 tail_mask = -1ULL >> (64 - n_total % 64); - validity_bitmap[n_total / 64] &= tail_mask; - } - + uint64 *restrict validity_bitmap = NULL; if (header->has_nulls) { + /* Fill validity and indices of the array elements, reshuffling for nulls if needed. */ + const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64; + validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); + + /* + * First, mark all data as valid, we will fill the nulls later if needed. + * Note that the validity bitmap size is a multiple of 64 bits. We have to + * fill the tail bits with zeros, because the corresponding elements are not + * valid. + * + */ + memset(validity_bitmap, 0xFF, validity_bitmap_bytes); + if (n_total % 64) + { + const uint64 tail_mask = ~0ULL >> (64 - n_total % 64); + validity_bitmap[n_total / 64] &= tail_mask; + } + /* * We have decompressed the data with nulls skipped, reshuffle it * according to the nulls bitmap. diff --git a/tsl/src/compression/algorithms/gorilla_impl.c b/tsl/src/compression/algorithms/gorilla_impl.c index 4ea4998a50b..85f6b6f512d 100644 --- a/tsl/src/compression/algorithms/gorilla_impl.c +++ b/tsl/src/compression/algorithms/gorilla_impl.c @@ -128,29 +128,30 @@ FUNCTION_NAME(gorilla_decompress_all, ELEMENT_TYPE)(CompressedGorillaData *goril decompressed_values[i] = decompressed_values[simple8brle_bitmap_prefix_sum(&tag0s, i) - 1]; } - /* - * We have unpacked the non-null data. Now reshuffle it to account for nulls, - * and fill the validity bitmap. - */ - const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64); - uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); - - /* - * First, mark all data as valid, we will fill the nulls later if needed. - * Note that the validity bitmap size is a multiple of 64 bits. We have to - * fill the tail bits with zeros, because the corresponding elements are not - * valid. - * - */ - memset(validity_bitmap, 0xFF, validity_bitmap_bytes); - if (n_total % 64) - { - const uint64 tail_mask = -1ULL >> (64 - n_total % 64); - validity_bitmap[n_total / 64] &= tail_mask; - } - + uint64 *restrict validity_bitmap = NULL; if (has_nulls) { + /* + * We have unpacked the non-null data. Now reshuffle it to account for nulls, + * and fill the validity bitmap. + */ + const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64); + validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes); + + /* + * First, mark all data as valid, we will fill the nulls later if needed. + * Note that the validity bitmap size is a multiple of 64 bits. We have to + * fill the tail bits with zeros, because the corresponding elements are not + * valid. + * + */ + memset(validity_bitmap, 0xFF, validity_bitmap_bytes); + if (n_total % 64) + { + const uint64 tail_mask = ~0ULL >> (64 - n_total % 64); + validity_bitmap[n_total / 64] &= tail_mask; + } + /* * We have decompressed the data with nulls skipped, reshuffle it * according to the nulls bitmap. diff --git a/tsl/src/compression/algorithms/simple8b_rle.h b/tsl/src/compression/algorithms/simple8b_rle.h index ae08cdf4e88..be98d4d3a03 100644 --- a/tsl/src/compression/algorithms/simple8b_rle.h +++ b/tsl/src/compression/algorithms/simple8b_rle.h @@ -857,7 +857,7 @@ simple8brle_selector_get_bitmask(uint8 selector) { uint8 bitLen = SIMPLE8B_BIT_LENGTH[selector]; Assert(bitLen != 0); - uint64 result = ((-1ULL) >> (64 - bitLen)); + uint64 result = ((~0ULL) >> (64 - bitLen)); return result; } diff --git a/tsl/src/compression/algorithms/simple8b_rle_bitmap.h b/tsl/src/compression/algorithms/simple8b_rle_bitmap.h index 11f245ee781..9237a9151e8 100644 --- a/tsl/src/compression/algorithms/simple8b_rle_bitmap.h +++ b/tsl/src/compression/algorithms/simple8b_rle_bitmap.h @@ -148,7 +148,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed) const int elements_this_block = Min(64, num_elements - decompressed_index); Assert(elements_this_block <= 64); Assert(elements_this_block > 0); - block_data &= (-1ULL) >> (64 - elements_this_block); + block_data &= (~0ULL) >> (64 - elements_this_block); /* * The number of block elements should fit within padding. Previous @@ -161,7 +161,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed) for (uint16 i = 0; i < 64; i++) { const uint16 word_prefix_sum = - __builtin_popcountll(block_data & (-1ULL >> (63 - i))); + __builtin_popcountll(block_data & ((~0ULL) >> (63 - i))); prefix_sums[decompressed_index + i] = num_ones + word_prefix_sum; } num_ones += __builtin_popcountll(block_data); @@ -304,7 +304,7 @@ simple8brle_bitmap_decompress(Simple8bRleSerialized *compressed) const int elements_this_block = Min(64, num_elements - decompressed_index); Assert(elements_this_block <= 64); Assert(elements_this_block > 0); - block_data &= (-1ULL) >> (64 - elements_this_block); + block_data &= (~0ULL) >> (64 - elements_this_block); /* * The number of block elements should fit within padding. Previous diff --git a/tsl/src/compression/arrow_c_data_interface.h b/tsl/src/compression/arrow_c_data_interface.h index 5a34870dff0..436fed48c82 100644 --- a/tsl/src/compression/arrow_c_data_interface.h +++ b/tsl/src/compression/arrow_c_data_interface.h @@ -136,6 +136,11 @@ struct ArrowSchema static pg_attribute_always_inline bool arrow_row_is_valid(const uint64 *bitmap, size_t row_number) { + if (bitmap == NULL) + { + return true; + } + const size_t qword_index = row_number / 64; const size_t bit_index = row_number % 64; const uint64 mask = 1ull << bit_index; @@ -164,6 +169,11 @@ pad_to_multiple(uint64 pad_to, uint64 source_value) static inline size_t arrow_num_valid(uint64 *bitmap, size_t total_rows) { + if (bitmap == NULL) + { + return total_rows; + } + uint64 num_valid = 0; #ifdef HAVE__BUILTIN_POPCOUNT const uint64 words = pad_to_multiple(64, total_rows) / 64; diff --git a/tsl/src/nodes/decompress_chunk/compressed_batch.c b/tsl/src/nodes/decompress_chunk/compressed_batch.c index bdaab5d607f..0dc9b9b9f67 100644 --- a/tsl/src/nodes/decompress_chunk/compressed_batch.c +++ b/tsl/src/nodes/decompress_chunk/compressed_batch.c @@ -566,9 +566,16 @@ compute_plain_qual(DecompressContext *dcontext, DecompressBatchState *batch_stat Assert((predicate_result != default_value_predicate_result) || n_vector_result_words == 1); /* to placate Coverity. */ const uint64 *validity = (const uint64 *) vector->buffers[0]; - for (size_t i = 0; i < n_vector_result_words; i++) + if (validity) { - predicate_result[i] &= validity[i]; + for (size_t i = 0; i < n_vector_result_words; i++) + { + predicate_result[i] &= validity[i]; + } + } + else + { + Assert(vector->null_count == 0); } } diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.c b/tsl/src/nodes/decompress_chunk/vector_predicates.c index 46fd148001b..f1be155c3b1 100644 --- a/tsl/src/nodes/decompress_chunk/vector_predicates.c +++ b/tsl/src/nodes/decompress_chunk/vector_predicates.c @@ -74,13 +74,14 @@ vector_nulltest(const ArrowArray *arrow, int test_type, uint64 *restrict result) const uint64 *validity = (const uint64 *) arrow->buffers[0]; for (uint16 i = 0; i < bitmap_words; i++) { + const uint64 validity_word = validity != NULL ? validity[i] : ~0ULL; if (should_be_null) { - result[i] &= ~validity[i]; + result[i] &= ~validity_word; } else { - result[i] &= validity[i]; + result[i] &= validity_word; } } } diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.h b/tsl/src/nodes/decompress_chunk/vector_predicates.h index 141563149dd..0fd3fa6f005 100644 --- a/tsl/src/nodes/decompress_chunk/vector_predicates.h +++ b/tsl/src/nodes/decompress_chunk/vector_predicates.h @@ -38,7 +38,7 @@ get_vector_qual_summary(uint64 *restrict qual_result, size_t n_rows) if (n_rows % 64 != 0) { - const uint64 last_word_mask = -1ULL >> (64 - n_rows % 64); + const uint64 last_word_mask = ~0ULL >> (64 - n_rows % 64); any_rows_pass |= (qual_result[n_rows / 64] & last_word_mask) != 0; all_rows_pass &= ((~qual_result[n_rows / 64]) & last_word_mask) == 0; } diff --git a/tsl/src/nodes/vector_agg/exec.c b/tsl/src/nodes/vector_agg/exec.c index cc51b805e34..6f37218f33a 100644 --- a/tsl/src/nodes/vector_agg/exec.c +++ b/tsl/src/nodes/vector_agg/exec.c @@ -149,12 +149,9 @@ vector_agg_exec(CustomScanState *vector_agg_state) * column value, we need to multiply this value with the number of * passing decompressed tuples in this batch. */ - int n = batch_state->total_batch_rows; - if (batch_state->vector_qual_result) - { - n = arrow_num_valid(batch_state->vector_qual_result, n); - Assert(n > 0); - } + const int n = + arrow_num_valid(batch_state->vector_qual_result, batch_state->total_batch_rows); + Assert(n > 0); int offs = AttrNumberGetAttrOffset(value_column_description->custom_scan_attno); agg->agg_const(batch_state->decompressed_scan_slot_data.base.tts_values[offs], diff --git a/tsl/src/nodes/vector_agg/functions.c b/tsl/src/nodes/vector_agg/functions.c index fa9a185c25b..60d82885e6a 100644 --- a/tsl/src/nodes/vector_agg/functions.c +++ b/tsl/src/nodes/vector_agg/functions.c @@ -44,30 +44,18 @@ int4_sum_vector(ArrowArray *vector, uint64 *filter, Datum *agg_value, bool *agg_ */ Assert(vector->length <= INT_MAX); - int64 batch_sum = 0; - /* - * This loop is not unrolled automatically, so do it manually as usual. - * The value buffer is padded to an even multiple of 64 bytes, i.e. to - * 64 / 4 = 16 elements. The bitmap is an even multiple of 64 elements. - * The number of elements in the inner loop must be less than both these - * values so that we don't go out of bounds. The particular value was - * chosen because it gives some speedup, and the larger values blow up - * the generated code with no performance benefit (checked on clang 16). + * Note that we use a simplest loop here, there are many possibilities of + * optimizing this function (for example, this loop is not unrolled by + * clang-16). */ -#define INNER_LOOP_SIZE 4 - const int outer_boundary = pad_to_multiple(INNER_LOOP_SIZE, vector->length); - for (int outer = 0; outer < outer_boundary; outer += INNER_LOOP_SIZE) + int64 batch_sum = 0; + for (int row = 0; row < vector->length; row++) { - for (int inner = 0; inner < INNER_LOOP_SIZE; inner++) - { - const int row = outer + inner; - const int32 arrow_value = ((int32 *) vector->buffers[1])[row]; - const bool passes_filter = filter ? arrow_row_is_valid(filter, row) : true; - batch_sum += passes_filter * arrow_value * arrow_row_is_valid(vector->buffers[0], row); - } + const int32 arrow_value = ((int32 *) vector->buffers[1])[row]; + batch_sum += arrow_value * arrow_row_is_valid(filter, row) * + arrow_row_is_valid(vector->buffers[0], row); } -#undef INNER_LOOP_SIZE int64 tmp = DatumGetInt64(*agg_value); if (unlikely(pg_add_s64_overflow(tmp, batch_sum, &tmp)))