Add faster bit unpacking
Uses BMI2 and AVX2 to unpack contiguous and non-contiguous accesses to
bit-packed data with widths of 24 bits or less.

Contiguous runs of fields up to 16 bits wide are loaded 64 bits at a
time and spread into separate bytes or 16-bit words with pdep. The
bytes/shorts are then widened to a vector of 8x32 and stored. A 64-bit
target width widens the 8x32 to two 4x64 vectors.
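
As an illustration of the pdep step, here is a minimal sketch (not part
of the change; the function name and the fixed width of 3 are only for
the example). Eight 3-bit fields occupy the low 24 bits of a 64-bit
load; a deposit mask with the low 3 bits of every byte set spreads them
into one field per byte, which AVX2 then widens to 8x32.

#include <immintrin.h> // _pdep_u64 (BMI2) and AVX2 intrinsics
#include <cstdint>
#include <cstring>

void unpackEight3BitFields(const uint8_t* packed, int32_t* out) {
  // Deposit mask: the low 3 bits of each of the 8 byte lanes are set.
  constexpr uint64_t kDepMask = 0x0707070707070707ULL;
  uint64_t word;
  // Load 8 bytes; the 8 fields sit in the low 24 bits. The caller must
  // guarantee that 8 bytes are readable.
  std::memcpy(&word, packed, sizeof(word));
  uint64_t spread = _pdep_u64(word, kDepMask); // one 3-bit field per byte
  __m256i eightInts = _mm256_cvtepu8_epi32(
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&spread)));
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), eightInts);
}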

If the positions to load are not contiguous, the byte offsets and bit
shifts are calculated as 8x32 vectors and the fields are read with an
8x32 gather. The data is then in the lanes, but if the bit width is not
a multiple of 8, a different shift has to be applied to each lane. This
is done by multiplying the lanes by an 8x32 vector of multipliers,
where each lane's multiplier is selected by its bit shift: the
multiplier table is permuted by the bit shift vector. After the
multiply, all lanes are aligned and can be shifted down by 8 bits, and
the extra high bits can be anded off.

Contiguous fields of more than 16 bits are also loaded with a gather, since a 64-bit pdep source would hold only 2 or 3 such fields at a time (64 / 24 = 2, 64 / 17 = 3).

A benchmark compares the fast path with a naive implementation. The
speedup is between 3x and 6x.

In TPC-H with Parquet, processing of bit fields goes down from ~7% to
~2.5% of the profile in velox_tpch_benchmark at scale 10.
Orri Erling committed Aug 23, 2022
1 parent e257e7a commit eef5683
Showing 5 changed files with 509 additions and 8 deletions.
2 changes: 1 addition & 1 deletion scripts/setup-helper-functions.sh
@@ -100,7 +100,7 @@ function get_cxx_flags {
;;

"avx")
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17"
echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -g -std=c++17"
;;

"sse")
222 changes: 216 additions & 6 deletions velox/dwio/common/IntDecoder.cpp
@@ -2405,6 +2405,198 @@ template void IntDecoder<false>::bulkReadRows(
int16_t* result,
int32_t initialRow);

#if XSIMD_WITH_AVX2
// Bit unpacking using BMI2 and AVX2.
typedef int32_t __m256si __attribute__((__vector_size__(32), __may_alias__));

typedef int32_t __m256si_u
__attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

namespace {

template <int8_t i>
auto as4x64(__m256i x) {
return _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, i));
}

// Stores 8 decoded values: as one 8x32 vector when T is 32 bits wide,
// otherwise as two 4x64 vectors.
template <typename T>
void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) {
if (sizeof(T) == 4) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts);
} else {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i), as4x64<0>(eightInts));
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(result + i + 4), as4x64<1>(eightInts));
}
}

template <typename T>
inline T* addBytes(T* pointer, int32_t bytes) {
return reinterpret_cast<T*>(reinterpret_cast<uint64_t>(pointer) + bytes);
}

template <typename T>
inline __m256i as256i(T x) {
return reinterpret_cast<__m256i>(x);
}

template <typename T>
inline __m256si as8x32(T x) {
return reinterpret_cast<__m256si>(x);
}

// Loads 8 bit fields at the positions given by 'rows[i..i+7]' with a 32-bit
// gather, aligns each lane with the per-lane multiplier trick and masks off
// the bits above 'width'.
template <typename T>
FOLLY_ALWAYS_INLINE __m256i gather8Sparse(
const uint64_t* bits,
int32_t bitOffset,
const int32_t* rows,
int32_t i,
__m256si masks,
uint8_t width,
T* result) {
constexpr __m256si kMultipliers = {256, 128, 64, 32, 16, 8, 4, 2};

auto indices =
*reinterpret_cast<const __m256si_u*>(rows + i) * width + bitOffset;
__m256si multipliers;
if (width % 8 != 0) {
multipliers = (__m256si)_mm256_permutevar8x32_epi32(
as256i(kMultipliers), as256i(indices & 7));
}
auto byteIndices = indices >> 3;
auto data = as8x32(_mm256_i32gather_epi32(
reinterpret_cast<const int*>(bits), as256i(byteIndices), 1));
if (width % 8 != 0) {
data = (data * multipliers) >> 8;
}
return as256i(data & masks);
}

// Deposit masks for _pdep_u64: the low 'width' bits replicated into each of
// 8 byte lanes (kDepMask8) or each of 4 16-bit lanes (kDepMask16).
template <uint8_t width>
struct PdepMask {
static constexpr uint64_t kMask = bits::lowMask(width);
static constexpr uint64_t kMask2 = kMask | (kMask << 8);
static constexpr uint64_t kMask4 = kMask2 | (kMask2 << 16);
static constexpr uint64_t kDepMask8 = kMask4 | (kMask4 << 32);
static constexpr uint64_t kMask16 = kMask | (kMask << 16);
static constexpr uint64_t kDepMask16 = kMask16 | (kMask16 << 32);
};

uint64_t pdepMasks8[] = {
0,
PdepMask<1>::kDepMask8,
PdepMask<2>::kDepMask8,
PdepMask<3>::kDepMask8,
PdepMask<4>::kDepMask8,
PdepMask<5>::kDepMask8,
PdepMask<6>::kDepMask8,
PdepMask<7>::kDepMask8,
PdepMask<8>::kDepMask8};

uint64_t pdepMasks16[] = {
PdepMask<9>::kDepMask16,
PdepMask<10>::kDepMask16,
PdepMask<11>::kDepMask16,
PdepMask<12>::kDepMask16,
PdepMask<13>::kDepMask16,
PdepMask<14>::kDepMask16,
PdepMask<15>::kDepMask16,
PdepMask<16>::kDepMask16};

// Decodes bit fields of 1-24 bits at the positions in 'rows', 8 at a time.
// Returns the number of rows decoded (a multiple of 8); the caller finishes
// the remaining rows with the scalar path.
template <typename T>
int32_t decode1To24(
const uint64_t* bits,
int32_t bitOffset,
const int* rows,
int32_t numRows,
uint8_t width,
T* result) {
const auto masks = as8x32(_mm256_set1_epi32(bits::lowMask(width)));
__m256i eightInts;
int i = 0;
if (width <= 8) {
uint64_t depMask = pdepMasks8[width];
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
auto endRow = rows[i + 7];
if (endRow - row == 7) {
uint64_t eightBytes;
if (width == 8) {
if (!bitOffset) {
eightBytes = *addBytes(bits, row);
} else {
eightBytes =
bits::detail::loadBits<uint64_t>(bits, bitOffset + 8 * row, 64);
}
} else {
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
eightBytes = _pdep_u64(word, depMask);
}
eightInts = _mm256_cvtepu8_epi32(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&eightBytes)));
} else {
eightInts =
gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
}
store8Ints(eightInts, i, result);
}
} else if (width <= 16) {
uint64_t depMask = pdepMasks16[width - 9];
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
auto endRow = rows[i + 7];
if (endRow - row == 7) {
// Use pdep to expand 2 words of bit-packed data with width
// 9-16. For widths <= 14, four bit-packed fields can always be
// loaded with a single uint64_t load. For 15 and 16 bits this
// depends on the start bit position. In either case we fill
// an array of 2x64 bits and widen that to an 8x32 vector.
uint64_t words[2];
if (width <= 14) {
auto bit = row * width + bitOffset;
auto byte = bit >> 3;
auto shift = bit & 7;
uint64_t word = *addBytes(bits, byte) >> shift;
words[0] = _pdep_u64(word, depMask);
bit += 4 * width;
byte = bit >> 3;
shift = bit & 7;
word = *addBytes(bits, byte) >> shift;
words[1] = _pdep_u64(word, depMask);
} else {
words[0] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * row, 64);
words[1] = bits::detail::loadBits<uint64_t>(
bits, bitOffset + width * (row + 4), 64);
if (width == 15) {
words[0] = _pdep_u64(words[0], depMask);
words[1] = _pdep_u64(words[1], depMask);
}
}
eightInts = _mm256_cvtepu16_epi32(
_mm_load_si128(reinterpret_cast<const __m128i*>(&words)));
} else {
eightInts =
gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
}
store8Ints(eightInts, i, result);
}
} else {
for (; i + 8 <= numRows; i += 8) {
auto row = rows[i];
eightInts = gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
store8Ints(eightInts, i, result);
}
}
return i;
}
} // namespace
#endif

template <bool isSigned>
template <typename T>
// static
@@ -2417,10 +2609,19 @@ void IntDecoder<isSigned>::decodeBitsLE(
const char* bufferEnd,
T* FOLLY_NONNULL result) {
uint64_t mask = bits::lowMask(bitWidth);
// We subtract rowBias * bitWidth bits from the starting position.
bitOffset -= rowBias * bitWidth;
if (bitOffset < 0) {
// Decrement the pointer by enough bytes to have a non-negative bitOffset.
auto bytes = bits::roundUp(-bitOffset, 8) / 8;
bitOffset += bytes * 8;
bits = reinterpret_cast<const uint64_t*>(
reinterpret_cast<const char*>(bits) - bytes);
}
auto numRows = rows.size();
if (bitWidth > 56) {
for (auto i = 0; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
result[i] = bits::detail::loadBits<T>(bits, bit, bitWidth) & mask;
}
return;
@@ -2430,8 +2631,7 @@
bool anyUnsafe = false;
if (bufferEnd) {
const char* endByte = reinterpret_cast<const char*>(bits) +
bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
8;
bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8;
// redzone is the number of bytes at the end of the accessed range that
// could overflow the buffer if accessed 64 bits wide.
int64_t redZone =
@@ -2448,8 +2648,18 @@
}
}
}
for (auto i = 0; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
int32_t i = 0;

#if XSIMD_WITH_AVX2
// Use the BMI2/AVX2 fast path for widths of 24 bits or less.
if (bitWidth <= 24) {
i = decode1To24(
bits, bitOffset, rows.data(), numSafeRows, bitWidth, result);
}
#endif

for (; i < numSafeRows; ++i) {
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = (*reinterpret_cast<const uint64_t*>(
@@ -2461,7 +2671,7 @@
auto lastSafeWord = bufferEnd - sizeof(uint64_t);
assert(lastSafeWord); // lint
for (auto i = numSafeRows; i < numRows; ++i) {
auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
auto bit = bitOffset + (rows[i]) * bitWidth;
auto byte = bit / 8;
auto shift = bit & 7;
result[i] = IntDecoder<isSigned>::safeLoadBits(
1 change: 1 addition & 0 deletions velox/dwio/common/tests/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(utils)
add_executable(
velox_dwio_common_test
BitConcatenationTest.cpp
DecodeBitsTest.cpp
ChainedBufferTests.cpp
DataBufferTests.cpp
DecoderUtilTest.cpp