Skip to content

Commit

Permalink
Null validity bitmap in ArrowArray (#7199)
Browse files Browse the repository at this point in the history
This is allowed when we have no nulls; it simplifies the code a little and
reduces memory usage.
  • Loading branch information
akuzm authored Aug 20, 2024
1 parent 32b35f3 commit 518cd47
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 110 deletions.
2 changes: 1 addition & 1 deletion src/adts/bit_array_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,5 +348,5 @@ static uint64
bit_array_low_bits_mask(uint8 bits_used)
{
Assert(bits_used > 0);
return -1ULL >> (64 - bits_used);
return ~0ULL >> (64 - bits_used);
}
35 changes: 18 additions & 17 deletions tsl/src/compression/algorithms/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -575,25 +575,26 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls,
}
offsets[n_notnull] = offset;

const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64);
uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

uint64 *restrict validity_bitmap = NULL;
if (has_nulls)
{
const int validity_bitmap_bytes = sizeof(uint64) * (pad_to_multiple(64, n_total) / 64);
validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

/*
* We have decompressed the data with nulls skipped, reshuffle it
* according to the nulls bitmap.
Expand Down
37 changes: 19 additions & 18 deletions tsl/src/compression/algorithms/deltadelta_impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
Assert(n_total >= n_notnull);
Assert(n_total <= GLOBAL_MAX_ROWS_PER_COMPRESSION);

const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* We need additional padding at the end of buffer, because the code that
* converts the elements to postgres Datum always reads in 8 bytes.
Expand Down Expand Up @@ -91,23 +88,27 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
}
#undef INNER_LOOP_SIZE

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

/* Now move the data to account for nulls, and fill the validity bitmap. */
uint64 *restrict validity_bitmap = NULL;
if (has_nulls)
{
/* Now move the data to account for nulls, and fill the validity bitmap. */
const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

/*
* The number of not-null elements we have must be consistent with the
* nulls bitmap.
Expand Down
37 changes: 19 additions & 18 deletions tsl/src/compression/algorithms/dictionary.c
Original file line number Diff line number Diff line change
Expand Up @@ -458,26 +458,27 @@ tsl_text_dictionary_decompress_all(Datum compressed, Oid element_type, MemoryCon
text_array_decompress_all_serialized_no_header(&si, /* has_nulls = */ false, dest_mctx);
CheckCompressedData(header->num_distinct == dict->length);

/* Fill validity and indices of the array elements, reshuffling for nulls if needed. */
const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64;
uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

uint64 *restrict validity_bitmap = NULL;
if (header->has_nulls)
{
/* Fill validity and indices of the array elements, reshuffling for nulls if needed. */
const int validity_bitmap_bytes = sizeof(uint64) * pad_to_multiple(64, n_total) / 64;
validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

/*
* We have decompressed the data with nulls skipped, reshuffle it
* according to the nulls bitmap.
Expand Down
43 changes: 22 additions & 21 deletions tsl/src/compression/algorithms/gorilla_impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,29 +128,30 @@ FUNCTION_NAME(gorilla_decompress_all, ELEMENT_TYPE)(CompressedGorillaData *goril
decompressed_values[i] = decompressed_values[simple8brle_bitmap_prefix_sum(&tag0s, i) - 1];
}

/*
* We have unpacked the non-null data. Now reshuffle it to account for nulls,
* and fill the validity bitmap.
*/
const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
uint64 *restrict validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = -1ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

uint64 *restrict validity_bitmap = NULL;
if (has_nulls)
{
/*
* We have unpacked the non-null data. Now reshuffle it to account for nulls,
* and fill the validity bitmap.
*/
const int validity_bitmap_bytes = sizeof(uint64) * ((n_total + 64 - 1) / 64);
validity_bitmap = MemoryContextAlloc(dest_mctx, validity_bitmap_bytes);

/*
* First, mark all data as valid, we will fill the nulls later if needed.
* Note that the validity bitmap size is a multiple of 64 bits. We have to
* fill the tail bits with zeros, because the corresponding elements are not
* valid.
*
*/
memset(validity_bitmap, 0xFF, validity_bitmap_bytes);
if (n_total % 64)
{
const uint64 tail_mask = ~0ULL >> (64 - n_total % 64);
validity_bitmap[n_total / 64] &= tail_mask;
}

/*
* We have decompressed the data with nulls skipped, reshuffle it
* according to the nulls bitmap.
Expand Down
2 changes: 1 addition & 1 deletion tsl/src/compression/algorithms/simple8b_rle.h
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ simple8brle_selector_get_bitmask(uint8 selector)
{
uint8 bitLen = SIMPLE8B_BIT_LENGTH[selector];
Assert(bitLen != 0);
uint64 result = ((-1ULL) >> (64 - bitLen));
uint64 result = ((~0ULL) >> (64 - bitLen));
return result;
}

Expand Down
6 changes: 3 additions & 3 deletions tsl/src/compression/algorithms/simple8b_rle_bitmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed)
const int elements_this_block = Min(64, num_elements - decompressed_index);
Assert(elements_this_block <= 64);
Assert(elements_this_block > 0);
block_data &= (-1ULL) >> (64 - elements_this_block);
block_data &= (~0ULL) >> (64 - elements_this_block);

/*
* The number of block elements should fit within padding. Previous
Expand All @@ -161,7 +161,7 @@ simple8brle_bitmap_prefixsums(Simple8bRleSerialized *compressed)
for (uint16 i = 0; i < 64; i++)
{
const uint16 word_prefix_sum =
__builtin_popcountll(block_data & (-1ULL >> (63 - i)));
__builtin_popcountll(block_data & ((~0ULL) >> (63 - i)));
prefix_sums[decompressed_index + i] = num_ones + word_prefix_sum;
}
num_ones += __builtin_popcountll(block_data);
Expand Down Expand Up @@ -304,7 +304,7 @@ simple8brle_bitmap_decompress(Simple8bRleSerialized *compressed)
const int elements_this_block = Min(64, num_elements - decompressed_index);
Assert(elements_this_block <= 64);
Assert(elements_this_block > 0);
block_data &= (-1ULL) >> (64 - elements_this_block);
block_data &= (~0ULL) >> (64 - elements_this_block);

/*
* The number of block elements should fit within padding. Previous
Expand Down
10 changes: 10 additions & 0 deletions tsl/src/compression/arrow_c_data_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ struct ArrowSchema
static pg_attribute_always_inline bool
arrow_row_is_valid(const uint64 *bitmap, size_t row_number)
{
if (bitmap == NULL)
{
return true;
}

const size_t qword_index = row_number / 64;
const size_t bit_index = row_number % 64;
const uint64 mask = 1ull << bit_index;
Expand Down Expand Up @@ -164,6 +169,11 @@ pad_to_multiple(uint64 pad_to, uint64 source_value)
static inline size_t
arrow_num_valid(uint64 *bitmap, size_t total_rows)
{
if (bitmap == NULL)
{
return total_rows;
}

uint64 num_valid = 0;
#ifdef HAVE__BUILTIN_POPCOUNT
const uint64 words = pad_to_multiple(64, total_rows) / 64;
Expand Down
11 changes: 9 additions & 2 deletions tsl/src/nodes/decompress_chunk/compressed_batch.c
Original file line number Diff line number Diff line change
Expand Up @@ -566,9 +566,16 @@ compute_plain_qual(DecompressContext *dcontext, DecompressBatchState *batch_stat
Assert((predicate_result != default_value_predicate_result) ||
n_vector_result_words == 1); /* to placate Coverity. */
const uint64 *validity = (const uint64 *) vector->buffers[0];
for (size_t i = 0; i < n_vector_result_words; i++)
if (validity)
{
predicate_result[i] &= validity[i];
for (size_t i = 0; i < n_vector_result_words; i++)
{
predicate_result[i] &= validity[i];
}
}
else
{
Assert(vector->null_count == 0);
}
}

Expand Down
5 changes: 3 additions & 2 deletions tsl/src/nodes/decompress_chunk/vector_predicates.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,14 @@ vector_nulltest(const ArrowArray *arrow, int test_type, uint64 *restrict result)
const uint64 *validity = (const uint64 *) arrow->buffers[0];
for (uint16 i = 0; i < bitmap_words; i++)
{
const uint64 validity_word = validity != NULL ? validity[i] : ~0ULL;
if (should_be_null)
{
result[i] &= ~validity[i];
result[i] &= ~validity_word;
}
else
{
result[i] &= validity[i];
result[i] &= validity_word;
}
}
}
2 changes: 1 addition & 1 deletion tsl/src/nodes/decompress_chunk/vector_predicates.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ get_vector_qual_summary(uint64 *restrict qual_result, size_t n_rows)

if (n_rows % 64 != 0)
{
const uint64 last_word_mask = -1ULL >> (64 - n_rows % 64);
const uint64 last_word_mask = ~0ULL >> (64 - n_rows % 64);
any_rows_pass |= (qual_result[n_rows / 64] & last_word_mask) != 0;
all_rows_pass &= ((~qual_result[n_rows / 64]) & last_word_mask) == 0;
}
Expand Down
9 changes: 3 additions & 6 deletions tsl/src/nodes/vector_agg/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,9 @@ vector_agg_exec(CustomScanState *vector_agg_state)
* column value, we need to multiply this value with the number of
* passing decompressed tuples in this batch.
*/
int n = batch_state->total_batch_rows;
if (batch_state->vector_qual_result)
{
n = arrow_num_valid(batch_state->vector_qual_result, n);
Assert(n > 0);
}
const int n =
arrow_num_valid(batch_state->vector_qual_result, batch_state->total_batch_rows);
Assert(n > 0);

int offs = AttrNumberGetAttrOffset(value_column_description->custom_scan_attno);
agg->agg_const(batch_state->decompressed_scan_slot_data.base.tts_values[offs],
Expand Down
28 changes: 8 additions & 20 deletions tsl/src/nodes/vector_agg/functions.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,30 +44,18 @@ int4_sum_vector(ArrowArray *vector, uint64 *filter, Datum *agg_value, bool *agg_
*/
Assert(vector->length <= INT_MAX);

int64 batch_sum = 0;

/*
* This loop is not unrolled automatically, so do it manually as usual.
* The value buffer is padded to an even multiple of 64 bytes, i.e. to
* 64 / 4 = 16 elements. The bitmap is an even multiple of 64 elements.
* The number of elements in the inner loop must be less than both these
* values so that we don't go out of bounds. The particular value was
* chosen because it gives some speedup, and the larger values blow up
* the generated code with no performance benefit (checked on clang 16).
* Note that we use a simplest loop here, there are many possibilities of
* optimizing this function (for example, this loop is not unrolled by
* clang-16).
*/
#define INNER_LOOP_SIZE 4
const int outer_boundary = pad_to_multiple(INNER_LOOP_SIZE, vector->length);
for (int outer = 0; outer < outer_boundary; outer += INNER_LOOP_SIZE)
int64 batch_sum = 0;
for (int row = 0; row < vector->length; row++)
{
for (int inner = 0; inner < INNER_LOOP_SIZE; inner++)
{
const int row = outer + inner;
const int32 arrow_value = ((int32 *) vector->buffers[1])[row];
const bool passes_filter = filter ? arrow_row_is_valid(filter, row) : true;
batch_sum += passes_filter * arrow_value * arrow_row_is_valid(vector->buffers[0], row);
}
const int32 arrow_value = ((int32 *) vector->buffers[1])[row];
batch_sum += arrow_value * arrow_row_is_valid(filter, row) *
arrow_row_is_valid(vector->buffers[0], row);
}
#undef INNER_LOOP_SIZE

int64 tmp = DatumGetInt64(*agg_value);
if (unlikely(pg_add_s64_overflow(tmp, batch_sum, &tmp)))
Expand Down

0 comments on commit 518cd47

Please sign in to comment.