Add faster bit unpacking
Uses BMI2 and AVX2 to unpack contiguous and non-contiguous accesses to
bit-packed data with widths of 24 bits or less.

Contiguous runs of fields up to 16 bits wide are loaded 64 bits at a
time and spread into separate bytes or 16-bit words with pdep. The
bytes/shorts are then widened to an 8x32 vector and stored. For a
64-bit target width, the 8x32 vector is widened to two 4x64 vectors.
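
A minimal sketch of the pdep step (not from this patch; assumes BMI2 and
a hypothetical 5-bit field width):

#include <immintrin.h>
#include <cstdint>

// Deposits eight packed 5-bit fields held in the low 40 bits of 'word' into
// the low 5 bits of each result byte; the bytes are then widened to 8x32.
inline uint64_t spreadFiveBitFields(uint64_t word) {
  constexpr uint64_t kDepositMask = 0x1f1f1f1f1f1f1f1fULL; // 5 bits per byte
  return _pdep_u64(word, kDepositMask);
}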

If the positions to load are not contiguous, the byte offsets and bit
shifts are calculated as 8x32 vectors and the fields are read with an
8x32 gather. The data is then in the lanes, but if the bit width is not
a multiple of 8, a different shift has to be applied to each lane. This
is done by multiplying the lanes by an 8x32 vector of multipliers, where
each lane's multiplier is selected by permuting a constant table with
the bit shift vector. Now all the lanes are aligned, so they can be
shifted down by 8 bits and the extra bits masked off.
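
The alignment relies on an arithmetic identity rather than a per-lane
variable shift: for a shift s in 0..7, (x * (256 >> s)) >> 8 equals
x >> s. A scalar sketch of the identity (illustrative only; in the
vector code the overflowing high bits are masked off afterwards):

#include <cstdint>

// Emulates x >> s for s in 0..7 with a multiply by 2^(8 - s) followed by a
// uniform shift right by 8; vectorized, only the multipliers differ per lane.
inline uint32_t shiftRightViaMultiply(uint32_t x, uint32_t s) {
  uint64_t multiplier = 256u >> s; // one of 256, 128, ..., 2
  return static_cast<uint32_t>((uint64_t{x} * multiplier) >> 8);
}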

Contiguous fields of more than 16 bits are loaded with the gather path,
since pdep would extract only 2 or 3 values at a time.

A benchmark compares the fast path with a naive implementation. The
speedup is between 3x and 6x.

In TPCH with Parquet, processing of bit fields goes down from ~7% to
~2.5% of the profile in velox_tpch_benchmark at scale 10.
Orri Erling committed Aug 24, 2022
1 parent 663a2e9 commit b3f0492
Showing 6 changed files with 496 additions and 9 deletions.
3 changes: 2 additions & 1 deletion .circleci/config.yml
@@ -80,7 +80,8 @@ executors:
jobs:
macos-build:
macos:
xcode: "11.7.0"
xcode: "12.5.1"
resource_class: large
steps:
- checkout
- run:
2 changes: 1 addition & 1 deletion scripts/setup-helper-functions.sh
@@ -100,7 +100,7 @@ function get_cxx_flags {
;;

"avx")
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17"
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -std=c++17"
;;

"sse")
207 changes: 201 additions & 6 deletions velox/dwio/common/IntDecoder.cpp
@@ -2405,6 +2405,159 @@ template void IntDecoder<false>::bulkReadRows(
int16_t* result,
int32_t initialRow);

#if XSIMD_WITH_AVX2
// Bit unpacking using BMI2 and AVX2.
typedef int32_t __m256si __attribute__((__vector_size__(32), __may_alias__));

typedef int32_t __m256si_u
__attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

namespace {

template <int8_t i>
auto as4x64(__m256i x) {
return _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, i));
}

template <typename T>
void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) {
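// Stores 'eightInts' as one 8x32 vector when T is 32 bits wide, or as two
// 4x64 vectors widened from the 8x32 input when T is 64 bits wide.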
if (sizeof(T) == 4) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts);
} else {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i), as4x64<0>(eightInts));
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i + 4), as4x64<1>(eightInts));
}
}

template <typename T>
inline T* addBytes(T* pointer, int32_t bytes) {
return reinterpret_cast<T*>(reinterpret_cast<uint64_t>(pointer) + bytes);
}

template <typename T>
inline __m256i as256i(T x) {
return reinterpret_cast<__m256i>(x);
}

template <typename T>
inline __m256si as8x32(T x) {
return reinterpret_cast<__m256si>(x);
}

template <uint8_t width, typename T>
FOLLY_ALWAYS_INLINE __m256i gather8Sparse(
const uint64_t* bits,
int32_t bitOffset,
const int32_t* rows,
int32_t i,
__m256si masks,
T* result) {
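// Gathers 8 bit-packed fields at arbitrary bit positions: byte offsets and
// in-byte shifts are computed as 8x32 vectors and one 32-bit gather reads
// the data. kMultipliers holds 256 >> s for each in-byte shift s in 0..7;
// multiplying a lane by 2^(8 - s) and shifting right by 8 is a right shift
// by s, so a single multiply aligns all lanes before masking.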
constexpr __m256si kMultipliers = {256, 128, 64, 32, 16, 8, 4, 2};

auto indices =
*reinterpret_cast<const __m256si_u*>(rows + i) * width + bitOffset;
__m256si multipliers;
if (width % 8 != 0) {
multipliers = (__m256si)_mm256_permutevar8x32_epi32(
as256i(kMultipliers), as256i(indices & 7));
}
auto byteIndices = indices >> 3;
auto data = as8x32(_mm256_i32gather_epi32(
reinterpret_cast<const int*>(bits), as256i(byteIndices), 1));
if (width % 8 != 0) {
data = (data * multipliers) >> 8;
}
return as256i(data & masks);
}

template <uint8_t width, typename T>
int32_t decode1To24(
const uint64_t* bits,
int32_t bitOffset,
const int* rows,
int32_t numRows,
T* result) {
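// Decodes fields of 1-24 bits at the positions in 'rows', 8 values at a
// time, and returns the number of rows decoded on this fast path. The
// kDepMask8 / kDepMask16 constants repeat the low 'width' bits in every
// byte / 16-bit lane so that pdep deposits one packed field per byte or
// per 16-bit lane.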
constexpr uint64_t kMask = bits::lowMask(width);
constexpr uint64_t kMask2 = kMask | (kMask << 8);
constexpr uint64_t kMask4 = kMask2 | (kMask2 << 16);
constexpr uint64_t kDepMask8 = kMask4 | (kMask4 << 32);
constexpr uint64_t kMask16 = kMask | (kMask << 16);
constexpr uint64_t kDepMask16 = kMask16 | (kMask16 << 32);
int32_t i = 0;
const auto masks = as8x32(_mm256_set1_epi32(kMask));
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
auto endRow = rows[i + 7];
__m256i eightInts;
if (width <= 16 && endRow - row == 7) {
// Special cases for 8 contiguous values with <= 16 bits.
if (width <= 8) {
uint64_t eightBytes;
if (width == 8) {
if (!bitOffset) {
eightBytes = *addBytes(bits, row);
} else {
eightBytes =
bits::detail::loadBits<uint64_t>(bits, bitOffset + 8 * row, 64);
}
} else {
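// Width < 8: the next 8 fields fit in a single 64-bit load; pdep spreads
// them into one byte each.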
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
eightBytes = _pdep_u64(word, kDepMask8);
}
eightInts = _mm256_cvtepu8_epi32(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&eightBytes)));
} else {
// Use pdep to expand 2 words of bit packed data with width
// 9-16. For widths <= 14, four bit packed fields can always be
// loaded with a single uint64_t load. For 15 and 16 bits this
// depends on the start bit position. In either case we fill
// an array of 2x64 bits and widen that to an 8x32 vector.
uint64_t words[2];
if (width <= 14) {
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
words[0] = _pdep_u64(word, kDepMask16);
bit += 4 * width;
byte = bit >> 3;
shift = bit & 7;
word = *addBytes(bits, byte) >> shift;
words[1] = _pdep_u64(word, kDepMask16);
} else {
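// Widths 15 and 16: four fields may straddle a 64-bit boundary, so use
// loadBits. Width 15 still needs pdep to pad each field to 16 bits;
// width 16 is already aligned.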
words[0] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * row, 64);
words[1] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * (row + 4), 64);
if (width == 15) {
words[0] = _pdep_u64(words[0], kDepMask16);
words[1] = _pdep_u64(words[1], kDepMask16);
}
}
eightInts = _mm256_cvtepu16_epi32(
_mm_load_si128(reinterpret_cast<const __m128i*>(&words)));
}
} else {
eightInts = gather8Sparse<width>(bits, bitOffset, rows, i, masks, result);
}
store8Ints(eightInts, i, result);
}
return i;
}

} // namespace
#endif

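// Instantiates decode1To24 for each supported bit width so that 'width' is
// a compile-time constant inside the decoder.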
#define WIDTH_CASE(width) \
case width: \
i = decode1To24<width>(bits, bitOffset, rows.data(), numSafeRows, result); \
break;

template <bool isSigned>
template <typename T>
// static
@@ -2417,10 +2570,19 @@ void IntDecoder<isSigned>::decodeBitsLE(
const char* bufferEnd,
T* FOLLY_NONNULL result) {
uint64_t mask = bits::lowMask(bitWidth);
// We subtract rowBias * bitWidth bits from the starting position.
bitOffset -= rowBias * bitWidth;
if (bitOffset < 0) {
// Decrement the pointer by enough bytes to have a non-negative bitOffset.
auto bytes = bits::roundUp(-bitOffset, 8) / 8;
bitOffset += bytes * 8;
bits = reinterpret_cast<const uint64_t*>(
reinterpret_cast<const char*>(bits) - bytes);
}
auto numRows = rows.size();
if (bitWidth > 56) {
for (auto i = 0; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
result[i] = bits::detail::loadBits<T>(bits, bit, bitWidth) & mask;
}
return;
@@ -2430,8 +2592,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
bool anyUnsafe = false;
if (bufferEnd) {
const char* endByte = reinterpret_cast<const char*>(bits) +
bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
8;
bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8;
// redZone is the number of bytes at the end of the accessed range that
// could overflow the buffer if accessed 64 bits wide.
int64_t redZone =
@@ -2448,8 +2609,42 @@
}
}
}
for (auto i = 0; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
int32_t i = 0;

#if XSIMD_WITH_AVX2
// Use AVX2 for specific widths.
switch (bitWidth) {
WIDTH_CASE(1);
WIDTH_CASE(2);
WIDTH_CASE(3);
WIDTH_CASE(4);
WIDTH_CASE(5);
WIDTH_CASE(6);
WIDTH_CASE(7);
WIDTH_CASE(8);
WIDTH_CASE(9);
WIDTH_CASE(10);
WIDTH_CASE(11);
WIDTH_CASE(12);
WIDTH_CASE(13);
WIDTH_CASE(14);
WIDTH_CASE(15);
WIDTH_CASE(16);
WIDTH_CASE(17);
WIDTH_CASE(18);
WIDTH_CASE(19);
WIDTH_CASE(20);
WIDTH_CASE(21);
WIDTH_CASE(22);
WIDTH_CASE(23);
WIDTH_CASE(24);
default:
break;
}
#endif

for (; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = (*reinterpret_cast<const uint64_t*>(
@@ -2461,7 +2656,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
auto lastSafeWord = bufferEnd - sizeof(uint64_t);
assert(lastSafeWord); // lint
for (auto i = numSafeRows; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = IntDecoder<isSigned>::safeLoadBits(
1 change: 1 addition & 0 deletions velox/dwio/common/tests/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(utils)
add_executable(
velox_dwio_common_test
BitConcatenationTest.cpp
DecodeBitsTest.cpp
ChainedBufferTests.cpp
DataBufferTests.cpp
DecoderUtilTest.cpp
