Add faster bit unpacking
Uses BMI2 and AVX2 to unpack contiguous and non-contiguous accesses to
bit-packed data with widths of 24 bits or less.

Contiguous runs of fields up to 16 bits wide are loaded 64 bits at a
time and spread into separate bytes or 16-bit words with pdep. The
bytes/shorts are then widened to a vector of 8x32 and stored. A 64-bit
target width widens the 8x32 to two 4x64 vectors.
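
As an illustration of the pdep step, here is a minimal sketch (not part
of the change; the function name and the fixed width of 3 are only for
the example). Eight 3-bit fields occupy the low 24 bits of a 64-bit
load; a deposit mask with the low 3 bits of every byte set spreads them
into one field per byte, which AVX2 then widens to 8x32.

#include <immintrin.h> // _pdep_u64 (BMI2) and AVX2 intrinsics
#include <cstdint>
#include <cstring>

void unpackEight3BitFields(const uint8_t* packed, int32_t* out) {
  // Deposit mask: the low 3 bits of each of the 8 byte lanes are set.
  constexpr uint64_t kDepMask = 0x0707070707070707ULL;
  uint64_t word;
  // Load 8 bytes; the 8 fields sit in the low 24 bits. The caller must
  // guarantee that 8 bytes are readable.
  std::memcpy(&word, packed, sizeof(word));
  uint64_t spread = _pdep_u64(word, kDepMask); // one 3-bit field per byte
  __m256i eightInts = _mm256_cvtepu8_epi32(
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&spread)));
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), eightInts);
}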

If the positions to load are not contiguous, the byte offsets and bit
shifts are calculated as 8x32 vectors and the fields are read with an
8x32 gather. The data is then in the lanes, but if the bit width is not
a multiple of 8, a different shift has to be applied to each lane. This
is done by multiplying the lanes by an 8x32 vector of multipliers,
where each lane's multiplier is selected by its bit shift: the
multiplier table is permuted by the bit shift vector. After the
multiply, all lanes are aligned and can be shifted down by 8 bits, and
the extra high bits can be anded off.

Contiguous fields of more than 16 bits are also loaded with a gather, since a 64-bit pdep source would hold only 2 or 3 such fields at a time (64 / 24 = 2, 64 / 17 = 3).

A benchmark compares the fast path with a naive implementation. The
speedup is between 3x and 6x.

In TPC-H with Parquet, processing of bit fields goes down from ~7% to
~2.5% of the profile in velox_tpch_benchmark at scale 10.
Orri Erling committed Aug 23, 2022
1 parent e257e7a commit eef5683
Showing 5 changed files with 509 additions and 8 deletions.
2 changes: 1 addition & 1 deletion scripts/setup-helper-functions.sh
@@ -100,7 +100,7 @@ function get_cxx_flags {
;;

"avx")
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17"
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -g -std=c++17"
;;

"sse")
222 changes: 216 additions & 6 deletions velox/dwio/common/IntDecoder.cpp
@@ -2405,6 +2405,198 @@ template void IntDecoder<false>::bulkReadRows(
int16_t* result,
int32_t initialRow);

#if XSIMD_WITH_AVX2
// Bit unpacking using BMI2 and AVX2.
typedef int32_t __m256si __attribute__((__vector_size__(32), __may_alias__));

typedef int32_t __m256si_u
__attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

namespace {

template <int8_t i>
auto as4x64(__m256i x) {
return _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, i));
}

// Stores 8 decoded values: as one 8x32 vector when T is 32 bits wide,
// otherwise as two 4x64 vectors.
template <typename T>
void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) {
if (sizeof(T) == 4) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts);
} else {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i), as4x64<0>(eightInts));
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i + 4), as4x64<1>(eightInts));
}
}

template <typename T>
inline T* addBytes(T* pointer, int32_t bytes) {
return reinterpret_cast<T*>(reinterpret_cast<uint64_t>(pointer) + bytes);
}

template <typename T>
inline __m256i as256i(T x) {
return reinterpret_cast<__m256i>(x);
}

template <typename T>
inline __m256si as8x32(T x) {
return reinterpret_cast<__m256si>(x);
}

// Loads 8 bit fields at the positions given by 'rows[i..i+7]' with a 32-bit
// gather, aligns each lane with the per-lane multiplier trick and masks off
// the bits above 'width'.
template <typename T>
FOLLY_ALWAYS_INLINE __m256i gather8Sparse(
const uint64_t* bits,
int32_t bitOffset,
const int32_t* rows,
int32_t i,
__m256si masks,
uint8_t width,
T* result) {
constexpr __m256si kMultipliers = {256, 128, 64, 32, 16, 8, 4, 2};

auto indices =
*reinterpret_cast<const __m256si_u*>(rows + i) * width + bitOffset;
__m256si multipliers;
if (width % 8 != 0) {
multipliers = (__m256si)_mm256_permutevar8x32_epi32(
as256i(kMultipliers), as256i(indices & 7));
}
auto byteIndices = indices >> 3;
auto data = as8x32(_mm256_i32gather_epi32(
reinterpret_cast<const int*>(bits), as256i(byteIndices), 1));
if (width % 8 != 0) {
data = (data * multipliers) >> 8;
}
return as256i(data & masks);
}

// Deposit masks for _pdep_u64: the low 'width' bits replicated into each of
// 8 byte lanes (kDepMask8) or each of 4 16-bit lanes (kDepMask16).
template <uint8_t width>
struct PdepMask {
static constexpr uint64_t kMask = bits::lowMask(width);
static constexpr uint64_t kMask2 = kMask | (kMask << 8);
static constexpr uint64_t kMask4 = kMask2 | (kMask2 << 16);
static constexpr uint64_t kDepMask8 = kMask4 | (kMask4 << 32);
static constexpr uint64_t kMask16 = kMask | (kMask << 16);
static constexpr uint64_t kDepMask16 = kMask16 | (kMask16 << 32);
};

uint64_t pdepMasks8[] = {
0,
PdepMask<1>::kDepMask8,
PdepMask<2>::kDepMask8,
PdepMask<3>::kDepMask8,
PdepMask<4>::kDepMask8,
PdepMask<5>::kDepMask8,
PdepMask<6>::kDepMask8,
PdepMask<7>::kDepMask8,
PdepMask<8>::kDepMask8};

uint64_t pdepMasks16[] = {
PdepMask<9>::kDepMask16,
PdepMask<10>::kDepMask16,
PdepMask<11>::kDepMask16,
PdepMask<12>::kDepMask16,
PdepMask<13>::kDepMask16,
PdepMask<14>::kDepMask16,
PdepMask<15>::kDepMask16,
PdepMask<16>::kDepMask16};

// Decodes bit fields of 1-24 bits at the positions in 'rows', 8 at a time.
// Returns the number of rows decoded (a multiple of 8); the caller finishes
// the remaining rows with the scalar path.
template <typename T>
int32_t decode1To24(
const uint64_t* bits,
int32_t bitOffset,
const int* rows,
int32_t numRows,
uint8_t width,
T* result) {
const auto masks = as8x32(_mm256_set1_epi32(bits::lowMask(width)));
__m256i eightInts;
int i = 0;
if (width <= 8) {
uint64_t depMask = pdepMasks8[width];
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
auto endRow = rows[i + 7];
if (endRow - row == 7) {
uint64_t eightBytes;
if (width == 8) {
if (!bitOffset) {
eightBytes = *addBytes(bits, row);
} else {
eightBytes =
bits::detail::loadBits<uint64_t>(bits, bitOffset + 8 * row, 64);
}
} else {
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
eightBytes = _pdep_u64(word, depMask);
}
eightInts = _mm256_cvtepu8_epi32(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&eightBytes)));
} else {
eightInts =
gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
}
store8Ints(eightInts, i, result);
}
} else if (width <= 16) {
uint64_t depMask = pdepMasks16[width - 9];
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
auto endRow = rows[i + 7];
if (endRow - row == 7) {
// Use pdep to expand 2 words of bit-packed data with width
// 9-16. For widths <= 14, four bit-packed fields can always be
// loaded with a single uint64_t load. For 15 and 16 bits this
// depends on the start bit position. In either case we fill
// an array of 2x64 bits and widen that to an 8x32 vector.
uint64_t words[2];
if (width <= 14) {
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
words[0] = _pdep_u64(word, depMask);
bit += 4 * width;
byte = bit >> 3;
shift = bit & 7;
word = *addBytes(bits, byte) >> shift;
words[1] = _pdep_u64(word, depMask);
} else {
words[0] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * row, 64);
words[1] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * (row + 4), 64);
if (width == 15) {
words[0] = _pdep_u64(words[0], depMask);
words[1] = _pdep_u64(words[1], depMask);
}
}
eightInts = _mm256_cvtepu16_epi32(
_mm_load_si128(reinterpret_cast<const __m128i*>(&words)));
} else {
eightInts =
gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
}
store8Ints(eightInts, i, result);
}
} else {
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
eightInts = gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
store8Ints(eightInts, i, result);
}
}
return i;
}
} // namespace
#endif

template <bool isSigned>
template <typename T>
// static
@@ -2417,10 +2609,19 @@ void IntDecoder<isSigned>::decodeBitsLE(
const char* bufferEnd,
T* FOLLY_NONNULL result) {
uint64_t mask = bits::lowMask(bitWidth);
// We subtract rowBias * bitWidth bits from the starting position.
bitOffset -= rowBias * bitWidth;
if (bitOffset < 0) {
// Decrement the pointer by enough bytes to have a non-negative bitOffset.
auto bytes = bits::roundUp(-bitOffset, 8) / 8;
bitOffset += bytes * 8;
bits = reinterpret_cast<const uint64_t*>(
reinterpret_cast<const char*>(bits) - bytes);
}
auto numRows = rows.size();
if (bitWidth > 56) {
for (auto i = 0; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
result[i] = bits::detail::loadBits<T>(bits, bit, bitWidth) & mask;
}
return;
@@ -2430,8 +2631,7 @@
bool anyUnsafe = false;
if (bufferEnd) {
const char* endByte = reinterpret_cast<const char*>(bits) +
bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
8;
bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8;
// redzone is the number of bytes at the end of the accessed range that
// could overflow the buffer if accessed 64 bits wide.
int64_t redZone =
@@ -2448,8 +2648,18 @@
}
}
}
for (auto i = 0; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
int32_t i = 0;

#if XSIMD_WITH_AVX2
// Use the BMI2/AVX2 fast path for widths of 24 bits or less.
if (bitWidth <= 24) {
i = decode1To24(
bits, bitOffset, rows.data(), numSafeRows, bitWidth, result);
}
#endif

for (; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = (*reinterpret_cast<const uint64_t*>(
@@ -2461,7 +2671,7 @@
auto lastSafeWord = bufferEnd - sizeof(uint64_t);
assert(lastSafeWord); // lint
for (auto i = numSafeRows; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = IntDecoder<isSigned>::safeLoadBits(
1 change: 1 addition & 0 deletions velox/dwio/common/tests/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(utils)
add_executable(
velox_dwio_common_test
BitConcatenationTest.cpp
DecodeBitsTest.cpp
ChainedBufferTests.cpp
DataBufferTests.cpp
DecoderUtilTest.cpp