diff --git a/.circleci/config.yml b/.circleci/config.yml
index 00dd0c688955..597a66085dd5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,7 +80,8 @@ executors:
 jobs:
   macos-build:
     macos:
-      xcode: "11.7.0"
+      xcode: "12.4.0"
+    resource_class: large
     steps:
       - checkout
       - run:
diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh
index 9faa82bff8d1..43120d9d6494 100644
--- a/scripts/setup-helper-functions.sh
+++ b/scripts/setup-helper-functions.sh
@@ -100,7 +100,7 @@ function get_cxx_flags {
   ;;
 
   "avx")
-    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17"
+    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -g -std=c++17"
   ;;
 
   "sse")
diff --git a/velox/dwio/common/IntDecoder.cpp b/velox/dwio/common/IntDecoder.cpp
index 11feda6fc6ae..fc7f7db1abc7 100644
--- a/velox/dwio/common/IntDecoder.cpp
+++ b/velox/dwio/common/IntDecoder.cpp
@@ -2405,6 +2405,198 @@ template void IntDecoder::bulkReadRows(
     int16_t* result,
     int32_t initialRow);
 
+#if XSIMD_WITH_AVX2
+// Bit unpacking using BMI2 and AVX2.
+typedef int32_t __m256si __attribute__((__vector_size__(32), __may_alias__));
+
+typedef int32_t __m256si_u
+    __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
+
+namespace {
+
+// Widens the i'th 128-bit half of 'x' from 4x32 to 4x64 bits.
+template <uint8_t i>
+auto as4x64(__m256i x) {
+  return _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, i));
+}
+
+template <typename T>
+void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) {
+  if (sizeof(T) == 4) {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts);
+  } else {
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(result + i), as4x64<0>(eightInts));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(result + i + 4), as4x64<1>(eightInts));
+  }
+}
+
+template <typename T>
+inline T* addBytes(T* pointer, int32_t bytes) {
+  return reinterpret_cast<T*>(reinterpret_cast<uint64_t>(pointer) + bytes);
+}
+
+template <typename T>
+inline __m256i as256i(T x) {
+  return reinterpret_cast<__m256i>(x);
+}
+
+template <typename T>
+inline __m256si as8x32(T x) {
+  return reinterpret_cast<__m256si>(x);
+}
+
+template <typename T>
+FOLLY_ALWAYS_INLINE __m256i gather8Sparse(
+    const uint64_t* bits,
+    int32_t bitOffset,
+    const int32_t* rows,
+    int32_t i,
+    __m256si masks,
+    uint8_t width,
+    T* result) {
+  constexpr __m256si kMultipliers = {256, 128, 64, 32, 16, 8, 4, 2};
+
+  auto indices =
+      *reinterpret_cast<const __m256si_u*>(rows + i) * width + bitOffset;
+  __m256si multipliers;
+  if (width % 8 != 0) {
+    multipliers = (__m256si)_mm256_permutevar8x32_epi32(
+        as256i(kMultipliers), as256i(indices & 7));
+  }
+  auto byteIndices = indices >> 3;
+  auto data = as8x32(_mm256_i32gather_epi32(
+      reinterpret_cast<const int*>(bits), as256i(byteIndices), 1));
+  if (width % 8 != 0) {
+    data = (data * multipliers) >> 8;
+  }
+  return as256i(data & masks);
+}
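+
+// Note on the multiplier trick above: for a field that starts at bit s
+// within its byte (s = indices & 7), kMultipliers[s] == 256 >> s, so
+// (data * (256 >> s)) >> 8 == data >> s. This emulates a per-lane variable
+// right shift with a single multiply; any bits pushed above bit 31 by the
+// multiply are discarded by the final '& masks'.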
+
+// pdep masks: kDepMask8 deposits each 'width'-bit field at the low end of
+// its own byte, kDepMask16 at the low end of its own 16-bit lane.
+template <uint8_t width>
+struct PdepMask {
+  static constexpr uint64_t kMask = bits::lowMask(width);
+  static constexpr uint64_t kMask2 = kMask | (kMask << 8);
+  static constexpr uint64_t kMask4 = kMask2 | (kMask2 << 16);
+  static constexpr uint64_t kDepMask8 = kMask4 | (kMask4 << 32);
+  static constexpr uint64_t kMask16 = kMask | (kMask << 16);
+  static constexpr uint64_t kDepMask16 = kMask16 | (kMask16 << 32);
+};
+
+uint64_t pdepMasks8[] = {
+    0,
+    PdepMask<1>::kDepMask8,
+    PdepMask<2>::kDepMask8,
+    PdepMask<3>::kDepMask8,
+    PdepMask<4>::kDepMask8,
+    PdepMask<5>::kDepMask8,
+    PdepMask<6>::kDepMask8,
+    PdepMask<7>::kDepMask8,
+    PdepMask<8>::kDepMask8};
+
+uint64_t pdepMasks16[] = {
+    PdepMask<9>::kDepMask16,
+    PdepMask<10>::kDepMask16,
+    PdepMask<11>::kDepMask16,
+    PdepMask<12>::kDepMask16,
+    PdepMask<13>::kDepMask16,
+    PdepMask<14>::kDepMask16,
+    PdepMask<15>::kDepMask16,
+    PdepMask<16>::kDepMask16};
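+
+// Worked example: for width == 3, kMask == 0b111 and kDepMask8 ==
+// 0x0707070707070707, so _pdep_u64 deposits eight consecutive 3-bit fields
+// into the low 3 bits of the eight result bytes, ready for
+// _mm256_cvtepu8_epi32. Likewise, for width == 13, kDepMask16 ==
+// 0x1fff1fff1fff1fff spreads four 13-bit fields into four 16-bit lanes.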
+
+template <typename T>
+int32_t decode1To24(
+    const uint64_t* bits,
+    int32_t bitOffset,
+    const int* rows,
+    int32_t numRows,
+    uint8_t width,
+    T* result) {
+  const auto masks = as8x32(_mm256_set1_epi32(bits::lowMask(width)));
+  __m256i eightInts;
+  int i = 0;
+  if (width <= 8) {
+    uint64_t depMask = pdepMasks8[width];
+    for (; i + 8 <= numRows; i += 8) {
+      auto row = rows[i];
+      auto endRow = rows[i + 7];
+      if (endRow - row == 7) {
+        // Dense case: 8 consecutive fields fit in one 64-bit load.
+        uint64_t eightBytes;
+        if (width == 8) {
+          if (!bitOffset) {
+            eightBytes = *addBytes(bits, row);
+          } else {
+            eightBytes = bits::detail::loadBits<uint64_t>(
+                bits, bitOffset + 8 * row, 64);
+          }
+        } else {
+          auto bit = row * width + bitOffset;
+          auto byte = bit >> 3;
+          auto shift = bit & 7;
+          uint64_t word = *addBytes(bits, byte) >> shift;
+          eightBytes = _pdep_u64(word, depMask);
+        }
+        eightInts = _mm256_cvtepu8_epi32(
+            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&eightBytes)));
+      } else {
+        eightInts =
+            gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      }
+      store8Ints(eightInts, i, result);
+    }
+  } else if (width <= 16) {
+    uint64_t depMask = pdepMasks16[width - 9];
+    for (; i + 8 <= numRows; i += 8) {
+      auto row = rows[i];
+      auto endRow = rows[i + 7];
+      if (endRow - row == 7) {
+        // Use pdep to shift 2 words of bit packed data with width
+        // 9-16. For widths <= 14 four bit packed fields can always be
+        // loaded with a single uint64_t load. For 15 and 16 bits this
+        // depends on the start bit position. For either case we fill
+        // an array of 2x64 bits and widen that to an 8x32 word.
+        uint64_t words[2];
+        if (width <= 14) {
+          auto bit = row * width + bitOffset;
+          auto byte = bit >> 3;
+          auto shift = bit & 7;
+          uint64_t word = *addBytes(bits, byte) >> shift;
+          words[0] = _pdep_u64(word, depMask);
+          bit += 4 * width;
+          byte = bit >> 3;
+          shift = bit & 7;
+          word = *addBytes(bits, byte) >> shift;
+          words[1] = _pdep_u64(word, depMask);
+        } else {
+          words[0] = bits::detail::loadBits<uint64_t>(
+              bits, bitOffset + width * row, 64);
+          words[1] = bits::detail::loadBits<uint64_t>(
+              bits, bitOffset + width * (row + 4), 64);
+          if (width == 15) {
+            words[0] = _pdep_u64(words[0], depMask);
+            words[1] = _pdep_u64(words[1], depMask);
+          }
+        }
+        eightInts = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128(reinterpret_cast<const __m128i*>(&words)));
+      } else {
+        eightInts =
+            gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      }
+      store8Ints(eightInts, i, result);
+    }
+  } else {
+    for (; i + 8 <= numRows; i += 8) {
+      eightInts = gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      store8Ints(eightInts, i, result);
+    }
+  }
+  return i;
+}
+
+} // namespace
+#endif
+
 template <bool isSigned>
 template <typename T>
 // static
@@ -2417,10 +2609,19 @@ void IntDecoder<isSigned>::decodeBitsLE(
     const char* bufferEnd,
     T* FOLLY_NONNULL result) {
   uint64_t mask = bits::lowMask(bitWidth);
+  // We subtract rowBias * bitWidth bits from the starting position.
+  bitOffset -= rowBias * bitWidth;
+  if (bitOffset < 0) {
+    // Decrement the pointer by enough bytes to have a non-negative bitOffset.
+    auto bytes = bits::roundUp(-bitOffset, 8) / 8;
+    bitOffset += bytes * 8;
+    bits = reinterpret_cast<const uint64_t*>(
+        reinterpret_cast<const char*>(bits) - bytes);
+  }
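+  // Example: rowBias == 3, bitWidth == 5 and an initial bitOffset of 2 give
+  // bitOffset == -13, so 'bits' moves back 2 bytes and bitOffset becomes 3;
+  // row 3 then addresses exactly the same bit as before, and rows[i] can be
+  // used unbiased below.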
   auto numRows = rows.size();
   if (bitWidth > 56) {
     for (auto i = 0; i < numRows; ++i) {
-      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto bit = bitOffset + (rows[i]) * bitWidth;
       result[i] = bits::detail::loadBits<uint64_t>(bits, bit, bitWidth) & mask;
     }
     return;
@@ -2430,8 +2631,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
   bool anyUnsafe = false;
   if (bufferEnd) {
     const char* endByte = reinterpret_cast<const char*>(bits) +
-        bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
-        8;
+        bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8;
     // redzone is the number of bytes at the end of the accessed range that
     // could overflow the buffer if accessed 64 bits wide.
     int64_t redZone =
         sizeof(uint64_t) - static_cast<int64_t>(bufferEnd - endByte);
     if (redZone > 0) {
       anyUnsafe = true;
       auto numRed = (redZone + 1) * 8 / bitWidth;
       int32_t lastSafeIndex = rows.back() - numRed;
       --numSafeRows;
       for (; numSafeRows >= 1; --numSafeRows) {
         if (rows[numSafeRows - 1] < lastSafeIndex) {
           break;
         }
       }
     }
   }
@@ -2448,8 +2648,18 @@ void IntDecoder<isSigned>::decodeBitsLE(
-  for (auto i = 0; i < numSafeRows; ++i) {
-    auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+  int32_t i = 0;
+
+#if XSIMD_WITH_AVX2
+  // Use AVX2 for specific widths.
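+  // Only widths up to 24 are dispatched here: a field of width <= 24
+  // starting at any bit offset spans at most 31 bits, so it always fits in
+  // the 32-bit lanes loaded by the gather in decode1To24.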
+  if (bitWidth <= 24) {
+    i = decode1To24(
+        bits, bitOffset, rows.data(), numSafeRows, bitWidth, result);
+  }
+#endif
+
+  for (; i < numSafeRows; ++i) {
+    auto bit = bitOffset + (rows[i]) * bitWidth;
     auto byte = bit / 8;
     auto shift = bit & 7;
     result[i] = (*reinterpret_cast<const uint64_t*>(
                      reinterpret_cast<const char*>(bits) + byte) >>
                  shift) &
         mask;
@@ -2461,7 +2671,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
     auto lastSafeWord = bufferEnd - sizeof(uint64_t);
     assert(lastSafeWord); // lint
     for (auto i = numSafeRows; i < numRows; ++i) {
-      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto bit = bitOffset + (rows[i]) * bitWidth;
       auto byte = bit / 8;
       auto shift = bit & 7;
       result[i] = IntDecoder::safeLoadBits(
diff --git a/velox/dwio/common/tests/CMakeLists.txt b/velox/dwio/common/tests/CMakeLists.txt
index 26b51fa2c776..43cfcfa65911 100644
--- a/velox/dwio/common/tests/CMakeLists.txt
+++ b/velox/dwio/common/tests/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(utils)
 add_executable(
   velox_dwio_common_test
   BitConcatenationTest.cpp
+  DecodeBitsTest.cpp
   ChainedBufferTests.cpp
   DataBufferTests.cpp
   DecoderUtilTest.cpp
diff --git a/velox/dwio/common/tests/DecodeBitsTest.cpp b/velox/dwio/common/tests/DecodeBitsTest.cpp
new file mode 100644
index 000000000000..dc7a87b0acf9
--- /dev/null
+++ b/velox/dwio/common/tests/DecodeBitsTest.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Random.h>
+#include "velox/common/base/Nulls.h"
+#include "velox/dwio/common/IntDecoder.h"
+
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace facebook::velox::dwio::common;
+using namespace facebook::velox;
+
+class DecodeBitsTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    for (int32_t i = 0; i < 100000; i++) {
+      auto randomInt = folly::Random::rand64();
+      randomInts_.push_back(randomInt);
+    }
+    populateBitPackedData();
+    allRowNumbers_.resize(randomInts_.size());
+    std::iota(allRowNumbers_.begin(), allRowNumbers_.end(), 0);
+    oddRowNumbers_.resize(randomInts_.size() / 2);
+    for (auto i = 0; i < oddRowNumbers_.size(); i++) {
+      oddRowNumbers_[i] = i * 2 + 1;
+    }
+    allRows_ = RowSet(allRowNumbers_);
+    oddRows_ = RowSet(oddRowNumbers_);
+  }
+
+  // Packs the low 'bitWidth' bits of each element of 'randomInts_' for
+  // every width from 1 to 31.
+  void populateBitPackedData() {
+    bitPackedData_.resize(32);
+    for (auto bitWidth = 1; bitWidth < 32; ++bitWidth) {
+      auto numWords = bits::roundUp(randomInts_.size() * bitWidth, 64) / 64;
+      bitPackedData_[bitWidth].resize(numWords);
+      auto source = randomInts_.data();
+      auto destination =
+          reinterpret_cast<uint64_t*>(bitPackedData_[bitWidth].data());
+      for (auto i = 0; i < randomInts_.size(); ++i) {
+        bits::copyBits(
+            source,
+            i * sizeof(*source) * 8,
+            destination,
+            i * bitWidth,
+            bitWidth);
+      }
+    }
+  }
+
+  template <typename T, typename U>
+  void checkDecodeResult(
+      const T* reference,
+      RowSet rows,
+      int8_t bitWidth,
+      const U* result) {
+    uint64_t mask = bits::lowMask(bitWidth);
+    for (auto i = 0; i < rows.size(); ++i) {
+      uint64_t original = reference[rows[i]] & mask;
+      ASSERT_EQ(original, result[i]) << " at " << i;
+    }
+  }
+
+  template <typename T>
+  void testDecodeRows(uint8_t width, RowSet rows) {
+    std::vector<T> result(rows.size());
+    int32_t start = 0;
+
+    int32_t batch = 1;
+    // Read the encoding in progressively larger batches, each time 3x
+    // larger than the previous one.
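+    // Batches of 1 and 3 rows exercise only the scalar tail of
+    // decodeBitsLE; batches of 9 or more rows also hit the
+    // 8-rows-at-a-time path when compiled with AVX2 support, and each
+    // batch starts at a new row, which varies the initial bit offset.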
+    auto bits = bitPackedData_[width].data();
+    do {
+      auto row = rows[start];
+      int32_t bit = row * width;
+      auto byteOffset = bit / 8;
+      auto bitOffset = bit & 7;
+      auto numRows = std::min<int32_t>(start + batch, rows.size()) - start;
+      auto bitsPointer = reinterpret_cast<const uint64_t*>(
+          reinterpret_cast<const char*>(bits) + byteOffset);
+
+      // end is the first unaddressable address after the bit packed data. We
+      // set this to the byte of the last bit field to exercise the safe
+      // path.
+      auto end = reinterpret_cast<const char*>(bitsPointer) +
+          (((start + rows[numRows - 1] - rows[start]) * width) / 8);
+      IntDecoder<false>::decodeBitsLE(
+          bitsPointer,
+          bitOffset,
+          RowSet(&rows[start], numRows),
+          rows[start],
+          width,
+          end,
+          result.data() + start);
+      start += batch;
+      batch *= 3;
+    } while (start < rows.size());
+    checkDecodeResult(randomInts_.data(), rows, width, result.data());
+  }
+
+  std::vector<uint64_t> randomInts_;
+
+  // All indices into 'randomInts_'.
+  std::vector<int32_t> allRowNumbers_;
+
+  // Indices into odd positions in 'randomInts_'.
+  std::vector<int32_t> oddRowNumbers_;
+
+  // Array of bit packed representations of 'randomInts_'. The array at
+  // index i is packed i bits wide and the values come from the low bits of
+  // the corresponding element of 'randomInts_'.
+  std::vector<std::vector<uint64_t>> bitPackedData_;
+
+  RowSet allRows_;
+  RowSet oddRows_;
+};
+
+TEST_F(DecodeBitsTest, allWidths) {
+  for (auto width = 1; width < bitPackedData_.size(); ++width) {
+    testDecodeRows<int32_t>(width, allRows_);
+    testDecodeRows<int64_t>(width, allRows_);
+    testDecodeRows<int32_t>(width, oddRows_);
+    testDecodeRows<int64_t>(width, oddRows_);
+  }
+}
diff --git a/velox/dwio/common/tests/IntDecoderBenchmark.cpp b/velox/dwio/common/tests/IntDecoderBenchmark.cpp
index fe969f1292e1..f636dc58bd10 100644
--- a/velox/dwio/common/tests/IntDecoderBenchmark.cpp
+++ b/velox/dwio/common/tests/IntDecoderBenchmark.cpp
@@ -23,9 +23,12 @@
 #include "folly/lang/Bits.h"
 #include "velox/common/base/BitUtil.h"
 #include "velox/dwio/common/IntCodecCommon.h"
+#include "velox/dwio/common/IntDecoder.h"
 #include "velox/dwio/common/exception/Exception.h"
 
+using namespace facebook::velox;
 using namespace facebook::velox::dwio;
+using namespace facebook::velox::dwio::common;
 namespace bits = facebook::velox::bits;
 
 const size_t kNumElements = 1000000;
@@ -45,6 +48,17 @@
 std::vector<uint64_t> randomInts_u64;
 std::vector<uint64_t> randomInts_u64_result;
 std::vector<char> buffer_u64;
 
+// Array of bit packed representations of randomInts_u32. The array at index i
+// is packed i bits wide and the values come from the low bits of the
+// corresponding element of randomInts_u32.
+std::vector<std::vector<uint64_t>> bitPackedData;
+
+std::vector<int32_t> result32;
+
+std::vector<int32_t> allRowNumbers;
+std::vector<int32_t> oddRowNumbers;
+RowSet allRows;
+RowSet oddRows;
+
 uint64_t readVuLong(const char* buffer, size_t& len) {
   if (LIKELY(len >= folly::kMaxVarintLength64)) {
     const char* p = buffer;
@@ -935,6 +949,144 @@ BENCHMARK_RELATIVE(decodeNew_64) {
       randomInts_u64.size(), buffer_u64.data(), randomInts_u64_result.data());
 }
 
+// Naive unpacking, original version of IntDecoder::decodeBitsLE.
+template <typename T>
+void naiveDecodeBitsLE(
+    const uint64_t* FOLLY_NONNULL bits,
+    int32_t bitOffset,
+    RowSet rows,
+    int32_t rowBias,
+    uint8_t bitWidth,
+    const char* bufferEnd,
+    T* FOLLY_NONNULL result) {
+  uint64_t mask = bits::lowMask(bitWidth);
+  auto numRows = rows.size();
+  if (bitWidth > 56) {
+    for (auto i = 0; i < numRows; ++i) {
+      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      result[i] = bits::detail::loadBits<uint64_t>(bits, bit, bitWidth) & mask;
+    }
+    return;
+  }
+  auto FOLLY_NONNULL lastSafe = bufferEnd - sizeof(uint64_t);
+  int32_t numSafeRows = numRows;
+  bool anyUnsafe = false;
+  if (bufferEnd) {
+    const char* endByte = reinterpret_cast<const char*>(bits) +
+        bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
+            8;
+    // redzone is the number of bytes at the end of the accessed range that
+    // could overflow the buffer if accessed 64 bits wide.
+    int64_t redZone =
+        sizeof(uint64_t) - static_cast<int64_t>(bufferEnd - endByte);
+    if (redZone > 0) {
+      anyUnsafe = true;
+      auto numRed = (redZone + 1) * 8 / bitWidth;
+      int32_t lastSafeIndex = rows.back() - numRed;
+      --numSafeRows;
+      for (; numSafeRows >= 1; --numSafeRows) {
+        if (rows[numSafeRows - 1] < lastSafeIndex) {
+          break;
+        }
+      }
+    }
+  }
+  for (auto i = 0; i < numSafeRows; ++i) {
+    auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+    auto byte = bit / 8;
+    auto shift = bit & 7;
+    result[i] = (*reinterpret_cast<const uint64_t*>(
+                     reinterpret_cast<const char*>(bits) + byte) >>
+                 shift) &
+        mask;
+  }
+  if (anyUnsafe) {
+    auto lastSafeWord = bufferEnd - sizeof(uint64_t);
+    assert(lastSafeWord); // lint
+    for (auto i = numSafeRows; i < numRows; ++i) {
+      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto byte = bit / 8;
+      auto shift = bit & 7;
+      result[i] = IntDecoder<false>::safeLoadBits(
+                      reinterpret_cast<const char*>(bits) + byte,
+                      shift,
+                      bitWidth,
+                      lastSafeWord) &
+          mask;
+    }
+  }
+}
+
+template <typename T>
+void unpackNaive(RowSet rows, uint8_t bitWidth, T* result) {
+  auto data = bitPackedData[bitWidth].data();
+  auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8;
+  auto end = reinterpret_cast<const char*>(data) + numBytes;
+  naiveDecodeBitsLE(data, 0, rows, 0, bitWidth, end, result);
+}
+
+template <typename T>
+void unpackFast(RowSet rows, uint8_t bitWidth, T* result) {
+  auto data = bitPackedData[bitWidth].data();
+  auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8;
+  auto end = reinterpret_cast<const char*>(data) + numBytes;
+  IntDecoder<false>::decodeBitsLE(data, 0, rows, 0, bitWidth, end, result);
+}
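+
+// Each BENCHMARK_RELATIVE below is reported by folly as a percentage of the
+// preceding plain BENCHMARK, so all cases of a width are compared against
+// the naive decode of all rows at that width.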
+#define BIT_BM_CASE_32(width)                          \
+  BENCHMARK(unpackNaive##width##_32) {                 \
+    unpackNaive(allRows, width, result32.data());      \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackFast##width##_32) {         \
+    unpackFast(allRows, width, result32.data());       \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackNaive##width##_32_odd) {    \
+    unpackNaive(oddRows, width, result32.data());      \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackFast##width##_32_odd) {     \
+    unpackFast(oddRows, width, result32.data());       \
+  }
+
+BIT_BM_CASE_32(7)
+BIT_BM_CASE_32(8)
+BIT_BM_CASE_32(13)
+BIT_BM_CASE_32(16)
+BIT_BM_CASE_32(22)
+BIT_BM_CASE_32(24)
+BIT_BM_CASE_32(31)
+
+void populateBitPacked() {
+  bitPackedData.resize(32);
+  for (auto bitWidth = 2; bitWidth < 32; ++bitWidth) {
+    auto numWords = bits::roundUp(randomInts_u32.size() * bitWidth, 64) / 64;
+    bitPackedData[bitWidth].resize(numWords);
+    auto source = reinterpret_cast<const uint64_t*>(randomInts_u32.data());
+    auto destination =
+        reinterpret_cast<uint64_t*>(bitPackedData[bitWidth].data());
+    for (auto i = 0; i < randomInts_u32.size(); ++i) {
+      bits::copyBits(source, i * 32, destination, i * bitWidth, bitWidth);
+    }
+  }
+  allRowNumbers.resize(randomInts_u32.size());
+  std::iota(allRowNumbers.begin(), allRowNumbers.end(), 0);
+  oddRowNumbers.resize(randomInts_u32.size() / 2);
+  for (auto i = 0; i < oddRowNumbers.size(); i++) {
+    oddRowNumbers[i] = i * 2 + 1;
+  }
+  allRows = RowSet(allRowNumbers);
+  oddRows = RowSet(oddRowNumbers);
+}
+
 int32_t main(int32_t argc, char* argv[]) {
   folly::init(&argc, &argv);
@@ -960,7 +1112,7 @@ int32_t main(int32_t argc, char* argv[]) {
   randomInts_u32_result.resize(randomInts_u32.size());
   len_u32 = pos;
 
-  // Populate uint32 buffer
+  // Populate uint64 buffer
   buffer_u64.resize(kNumElements);
   pos = 0;
   for (int32_t i = 0; i < 100000; i++) {
@@ -968,6 +1120,9 @@ int32_t main(int32_t argc, char* argv[]) {
     randomInts_u64.push_back(randomInt);
     pos = writeVulongToBuffer(randomInt, buffer_u64.data(), pos);
   }
+  populateBitPacked();
+  result32.resize(randomInts_u32.size());
+
   randomInts_u64_result.resize(randomInts_u64.size());
   len_u64 = pos;