diff --git a/.circleci/config.yml b/.circleci/config.yml
index 00dd0c688955..597a66085dd5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,7 +80,8 @@ executors:
 jobs:
   macos-build:
     macos:
-      xcode: "11.7.0"
+      xcode: "12.4.0"
+    resource_class: large
     steps:
       - checkout
       - run:
diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh
index 9faa82bff8d1..43120d9d6494 100644
--- a/scripts/setup-helper-functions.sh
+++ b/scripts/setup-helper-functions.sh
@@ -100,7 +100,7 @@ function get_cxx_flags {
   ;;
 
   "avx")
-    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17"
+    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -g -std=c++17"
   ;;
 
   "sse")
diff --git a/velox/dwio/common/IntDecoder.cpp b/velox/dwio/common/IntDecoder.cpp
index 11feda6fc6ae..fc7f7db1abc7 100644
--- a/velox/dwio/common/IntDecoder.cpp
+++ b/velox/dwio/common/IntDecoder.cpp
@@ -2405,6 +2405,198 @@ template void IntDecoder::bulkReadRows(
     int16_t* result,
     int32_t initialRow);
 
+#if XSIMD_WITH_AVX2
+// Bit unpacking using BMI2 and AVX2.
+typedef int32_t __m256si __attribute__((__vector_size__(32), __may_alias__));
+
+typedef int32_t __m256si_u
+    __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
+
+namespace {
+
+// Widens the i'th 128-bit half of 'x' from 4x32 to 4x64 bits.
+template <uint8_t i>
+auto as4x64(__m256i x) {
+  return _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, i));
+}
+
+template <typename T>
+void store8Ints(__m256i eightInts, int32_t i, T* FOLLY_NONNULL result) {
+  if (sizeof(T) == 4) {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), eightInts);
+  } else {
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(result + i), as4x64<0>(eightInts));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(result + i + 4), as4x64<1>(eightInts));
+  }
+}
+
+template <typename T>
+inline T* addBytes(T* pointer, int32_t bytes) {
+  return reinterpret_cast<T*>(reinterpret_cast<uint64_t>(pointer) + bytes);
+}
+
+template <typename T>
+inline __m256i as256i(T x) {
+  return reinterpret_cast<__m256i>(x);
+}
+
+template <typename T>
+inline __m256si as8x32(T x) {
+  return reinterpret_cast<__m256si>(x);
+}
+
+template <typename T>
+FOLLY_ALWAYS_INLINE __m256i gather8Sparse(
+    const uint64_t* bits,
+    int32_t bitOffset,
+    const int32_t* rows,
+    int32_t i,
+    __m256si masks,
+    uint8_t width,
+    T* result) {
+  constexpr __m256si kMultipliers = {256, 128, 64, 32, 16, 8, 4, 2};
+
+  auto indices =
+      *reinterpret_cast<const __m256si_u*>(rows + i) * width + bitOffset;
+  __m256si multipliers;
+  if (width % 8 != 0) {
+    multipliers = (__m256si)_mm256_permutevar8x32_epi32(
+        as256i(kMultipliers), as256i(indices & 7));
+  }
+  auto byteIndices = indices >> 3;
+  auto data = as8x32(_mm256_i32gather_epi32(
+      reinterpret_cast<const int*>(bits), as256i(byteIndices), 1));
+  if (width % 8 != 0) {
+    data = (data * multipliers) >> 8;
+  }
+  return as256i(data & masks);
+}
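+
+// Note on the multiplier trick above: for a field that starts at bit s
+// within its byte (s = indices & 7), kMultipliers[s] == 256 >> s, so
+// (data * (256 >> s)) >> 8 == data >> s. This emulates a per-lane variable
+// right shift with a single multiply; any bits pushed above bit 31 by the
+// multiply are discarded by the final '& masks'.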
+
+// pdep masks: kDepMask8 deposits each 'width'-bit field at the low end of
+// its own byte, kDepMask16 at the low end of its own 16-bit lane.
+template <uint8_t width>
+struct PdepMask {
+  static constexpr uint64_t kMask = bits::lowMask(width);
+  static constexpr uint64_t kMask2 = kMask | (kMask << 8);
+  static constexpr uint64_t kMask4 = kMask2 | (kMask2 << 16);
+  static constexpr uint64_t kDepMask8 = kMask4 | (kMask4 << 32);
+  static constexpr uint64_t kMask16 = kMask | (kMask << 16);
+  static constexpr uint64_t kDepMask16 = kMask16 | (kMask16 << 32);
+};
+
+uint64_t pdepMasks8[] = {
+    0,
+    PdepMask<1>::kDepMask8,
+    PdepMask<2>::kDepMask8,
+    PdepMask<3>::kDepMask8,
+    PdepMask<4>::kDepMask8,
+    PdepMask<5>::kDepMask8,
+    PdepMask<6>::kDepMask8,
+    PdepMask<7>::kDepMask8,
+    PdepMask<8>::kDepMask8};
+
+uint64_t pdepMasks16[] = {
+    PdepMask<9>::kDepMask16,
+    PdepMask<10>::kDepMask16,
+    PdepMask<11>::kDepMask16,
+    PdepMask<12>::kDepMask16,
+    PdepMask<13>::kDepMask16,
+    PdepMask<14>::kDepMask16,
+    PdepMask<15>::kDepMask16,
+    PdepMask<16>::kDepMask16};
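+
+// Worked example: for width == 3, kMask == 0b111 and kDepMask8 ==
+// 0x0707070707070707, so _pdep_u64 deposits eight consecutive 3-bit fields
+// into the low 3 bits of the eight result bytes, ready for
+// _mm256_cvtepu8_epi32. Likewise, for width == 13, kDepMask16 ==
+// 0x1fff1fff1fff1fff spreads four 13-bit fields into four 16-bit lanes.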
+
+template <typename T>
+int32_t decode1To24(
+    const uint64_t* bits,
+    int32_t bitOffset,
+    const int* rows,
+    int32_t numRows,
+    uint8_t width,
+    T* result) {
+  const auto masks = as8x32(_mm256_set1_epi32(bits::lowMask(width)));
+  __m256i eightInts;
+  int i = 0;
+  if (width <= 8) {
+    uint64_t depMask = pdepMasks8[width];
+    for (; i + 8 <= numRows; i += 8) {
+      auto row = rows[i];
+      auto endRow = rows[i + 7];
+      if (endRow - row == 7) {
+        // Dense case: 8 consecutive fields fit in one 64-bit load.
+        uint64_t eightBytes;
+        if (width == 8) {
+          if (!bitOffset) {
+            eightBytes = *addBytes(bits, row);
+          } else {
+            eightBytes = bits::detail::loadBits<uint64_t>(
+                bits, bitOffset + 8 * row, 64);
+          }
+        } else {
+          auto bit = row * width + bitOffset;
+          auto byte = bit >> 3;
+          auto shift = bit & 7;
+          uint64_t word = *addBytes(bits, byte) >> shift;
+          eightBytes = _pdep_u64(word, depMask);
+        }
+        eightInts = _mm256_cvtepu8_epi32(
+            _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&eightBytes)));
+      } else {
+        eightInts =
+            gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      }
+      store8Ints(eightInts, i, result);
+    }
+  } else if (width <= 16) {
+    uint64_t depMask = pdepMasks16[width - 9];
+    for (; i + 8 <= numRows; i += 8) {
+      auto row = rows[i];
+      auto endRow = rows[i + 7];
+      if (endRow - row == 7) {
+        // Use pdep to shift 2 words of bit packed data with width
+        // 9-16. For widths <= 14 four bit packed fields can always be
+        // loaded with a single uint64_t load. For 15 and 16 bits this
+        // depends on the start bit position. For either case we fill
+        // an array of 2x64 bits and widen that to an 8x32 word.
+        uint64_t words[2];
+        if (width <= 14) {
+          auto bit = row * width + bitOffset;
+          auto byte = bit >> 3;
+          auto shift = bit & 7;
+          uint64_t word = *addBytes(bits, byte) >> shift;
+          words[0] = _pdep_u64(word, depMask);
+          bit += 4 * width;
+          byte = bit >> 3;
+          shift = bit & 7;
+          word = *addBytes(bits, byte) >> shift;
+          words[1] = _pdep_u64(word, depMask);
+        } else {
+          words[0] = bits::detail::loadBits<uint64_t>(
+              bits, bitOffset + width * row, 64);
+          words[1] = bits::detail::loadBits<uint64_t>(
+              bits, bitOffset + width * (row + 4), 64);
+          if (width == 15) {
+            words[0] = _pdep_u64(words[0], depMask);
+            words[1] = _pdep_u64(words[1], depMask);
+          }
+        }
+        eightInts = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128(reinterpret_cast<const __m128i*>(&words)));
+      } else {
+        eightInts =
+            gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      }
+      store8Ints(eightInts, i, result);
+    }
+  } else {
+    for (; i + 8 <= numRows; i += 8) {
+      eightInts = gather8Sparse(bits, bitOffset, rows, i, masks, width, result);
+      store8Ints(eightInts, i, result);
+    }
+  }
+  return i;
+}
+
+} // namespace
+#endif
+
 template <bool isSigned>
 template <typename T>
 // static
@@ -2417,10 +2609,19 @@ void IntDecoder<isSigned>::decodeBitsLE(
     const char* bufferEnd,
     T* FOLLY_NONNULL result) {
   uint64_t mask = bits::lowMask(bitWidth);
+  // We subtract rowBias * bitWidth bits from the starting position.
+  bitOffset -= rowBias * bitWidth;
+  if (bitOffset < 0) {
+    // Decrement the pointer by enough bytes to have a non-negative bitOffset.
+    auto bytes = bits::roundUp(-bitOffset, 8) / 8;
+    bitOffset += bytes * 8;
+    bits = reinterpret_cast<const uint64_t*>(
+        reinterpret_cast<const char*>(bits) - bytes);
+  }
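+  // Example: rowBias == 3, bitWidth == 5 and an initial bitOffset of 2 give
+  // bitOffset == -13, so 'bits' moves back 2 bytes and bitOffset becomes 3;
+  // row 3 then addresses exactly the same bit as before, and rows[i] can be
+  // used unbiased below.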
   auto numRows = rows.size();
   if (bitWidth > 56) {
     for (auto i = 0; i < numRows; ++i) {
-      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto bit = bitOffset + (rows[i]) * bitWidth;
       result[i] = bits::detail::loadBits<uint64_t>(bits, bit, bitWidth) & mask;
     }
     return;
@@ -2430,8 +2631,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
   bool anyUnsafe = false;
   if (bufferEnd) {
     const char* endByte = reinterpret_cast<const char*>(bits) +
-        bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
-        8;
+        bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8;
     // redzone is the number of bytes at the end of the accessed range that
     // could overflow the buffer if accessed 64 bits wide.
     int64_t redZone =
         sizeof(uint64_t) - static_cast<int64_t>(bufferEnd - endByte);
     if (redZone > 0) {
       anyUnsafe = true;
       auto numRed = (redZone + 1) * 8 / bitWidth;
       int32_t lastSafeIndex = rows.back() - numRed;
       --numSafeRows;
       for (; numSafeRows >= 1; --numSafeRows) {
         if (rows[numSafeRows - 1] < lastSafeIndex) {
           break;
         }
       }
     }
   }
@@ -2448,8 +2648,18 @@ void IntDecoder<isSigned>::decodeBitsLE(
-  for (auto i = 0; i < numSafeRows; ++i) {
-    auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+  int32_t i = 0;
+
+#if XSIMD_WITH_AVX2
+  // Use AVX2 for specific widths.
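+  // Only widths up to 24 are dispatched here: a field of width <= 24
+  // starting at any bit offset spans at most 31 bits, so it always fits in
+  // the 32-bit lanes loaded by the gather in decode1To24.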
+  if (bitWidth <= 24) {
+    i = decode1To24(
+        bits, bitOffset, rows.data(), numSafeRows, bitWidth, result);
+  }
+#endif
+
+  for (; i < numSafeRows; ++i) {
+    auto bit = bitOffset + (rows[i]) * bitWidth;
     auto byte = bit / 8;
     auto shift = bit & 7;
     result[i] = (*reinterpret_cast<const uint64_t*>(
                      reinterpret_cast<const char*>(bits) + byte) >>
                  shift) &
         mask;
@@ -2461,7 +2671,7 @@ void IntDecoder<isSigned>::decodeBitsLE(
     auto lastSafeWord = bufferEnd - sizeof(uint64_t);
     assert(lastSafeWord); // lint
     for (auto i = numSafeRows; i < numRows; ++i) {
-      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto bit = bitOffset + (rows[i]) * bitWidth;
       auto byte = bit / 8;
       auto shift = bit & 7;
       result[i] = IntDecoder::safeLoadBits(
diff --git a/velox/dwio/common/tests/CMakeLists.txt b/velox/dwio/common/tests/CMakeLists.txt
index 26b51fa2c776..43cfcfa65911 100644
--- a/velox/dwio/common/tests/CMakeLists.txt
+++ b/velox/dwio/common/tests/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(utils)
 add_executable(
   velox_dwio_common_test
   BitConcatenationTest.cpp
+  DecodeBitsTest.cpp
   ChainedBufferTests.cpp
   DataBufferTests.cpp
   DecoderUtilTest.cpp
diff --git a/velox/dwio/common/tests/DecodeBitsTest.cpp b/velox/dwio/common/tests/DecodeBitsTest.cpp
new file mode 100644
index 000000000000..dc7a87b0acf9
--- /dev/null
+++ b/velox/dwio/common/tests/DecodeBitsTest.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Random.h>
+#include "velox/common/base/Nulls.h"
+#include "velox/dwio/common/IntDecoder.h"
+
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace facebook::velox::dwio::common;
+using namespace facebook::velox;
+
+class DecodeBitsTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    for (int32_t i = 0; i < 100000; i++) {
+      auto randomInt = folly::Random::rand64();
+      randomInts_.push_back(randomInt);
+    }
+    populateBitPackedData();
+    allRowNumbers_.resize(randomInts_.size());
+    std::iota(allRowNumbers_.begin(), allRowNumbers_.end(), 0);
+    oddRowNumbers_.resize(randomInts_.size() / 2);
+    for (auto i = 0; i < oddRowNumbers_.size(); i++) {
+      oddRowNumbers_[i] = i * 2 + 1;
+    }
+    allRows_ = RowSet(allRowNumbers_);
+    oddRows_ = RowSet(oddRowNumbers_);
+  }
+
+  // Packs the low 'bitWidth' bits of each element of 'randomInts_' for
+  // every width from 1 to 31.
+  void populateBitPackedData() {
+    bitPackedData_.resize(32);
+    for (auto bitWidth = 1; bitWidth < 32; ++bitWidth) {
+      auto numWords = bits::roundUp(randomInts_.size() * bitWidth, 64) / 64;
+      bitPackedData_[bitWidth].resize(numWords);
+      auto source = randomInts_.data();
+      auto destination =
+          reinterpret_cast<uint64_t*>(bitPackedData_[bitWidth].data());
+      for (auto i = 0; i < randomInts_.size(); ++i) {
+        bits::copyBits(
+            source,
+            i * sizeof(*source) * 8,
+            destination,
+            i * bitWidth,
+            bitWidth);
+      }
+    }
+  }
+
+  template <typename T, typename U>
+  void checkDecodeResult(
+      const T* reference,
+      RowSet rows,
+      int8_t bitWidth,
+      const U* result) {
+    uint64_t mask = bits::lowMask(bitWidth);
+    for (auto i = 0; i < rows.size(); ++i) {
+      uint64_t original = reference[rows[i]] & mask;
+      ASSERT_EQ(original, result[i]) << " at " << i;
+    }
+  }
+
+  template <typename T>
+  void testDecodeRows(uint8_t width, RowSet rows) {
+    std::vector<T> result(rows.size());
+    int32_t start = 0;
+
+    int32_t batch = 1;
+    // Read the encoding in progressively larger batches, each time 3x
+    // larger than the previous one.
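+    // Batches of 1 and 3 rows exercise only the scalar tail of
+    // decodeBitsLE; batches of 9 or more rows also hit the
+    // 8-rows-at-a-time path when compiled with AVX2 support, and each
+    // batch starts at a new row, which varies the initial bit offset.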
+    auto bits = bitPackedData_[width].data();
+    do {
+      auto row = rows[start];
+      int32_t bit = row * width;
+      auto byteOffset = bit / 8;
+      auto bitOffset = bit & 7;
+      auto numRows = std::min<int32_t>(start + batch, rows.size()) - start;
+      auto bitsPointer = reinterpret_cast<const uint64_t*>(
+          reinterpret_cast<const char*>(bits) + byteOffset);
+
+      // end is the first unaddressable address after the bit packed data. We
+      // set this to the byte of the last bit field to exercise the safe
+      // path.
+      auto end = reinterpret_cast<const char*>(bitsPointer) +
+          (((start + rows[numRows - 1] - rows[start]) * width) / 8);
+      IntDecoder<false>::decodeBitsLE(
+          bitsPointer,
+          bitOffset,
+          RowSet(&rows[start], numRows),
+          rows[start],
+          width,
+          end,
+          result.data() + start);
+      start += batch;
+      batch *= 3;
+    } while (start < rows.size());
+    checkDecodeResult(randomInts_.data(), rows, width, result.data());
+  }
+
+  std::vector<uint64_t> randomInts_;
+
+  // All indices into 'randomInts_'.
+  std::vector<int32_t> allRowNumbers_;
+
+  // Indices into odd positions in 'randomInts_'.
+  std::vector<int32_t> oddRowNumbers_;
+
+  // Array of bit packed representations of 'randomInts_'. The array at
+  // index i is packed i bits wide and the values come from the low bits of
+  // the corresponding element of 'randomInts_'.
+  std::vector<std::vector<uint64_t>> bitPackedData_;
+
+  RowSet allRows_;
+  RowSet oddRows_;
+};
+
+TEST_F(DecodeBitsTest, allWidths) {
+  for (auto width = 1; width < bitPackedData_.size(); ++width) {
+    testDecodeRows<int32_t>(width, allRows_);
+    testDecodeRows<int64_t>(width, allRows_);
+    testDecodeRows<int32_t>(width, oddRows_);
+    testDecodeRows<int64_t>(width, oddRows_);
+  }
+}
diff --git a/velox/dwio/common/tests/IntDecoderBenchmark.cpp b/velox/dwio/common/tests/IntDecoderBenchmark.cpp
index fe969f1292e1..f636dc58bd10 100644
--- a/velox/dwio/common/tests/IntDecoderBenchmark.cpp
+++ b/velox/dwio/common/tests/IntDecoderBenchmark.cpp
@@ -23,9 +23,12 @@
 #include "folly/lang/Bits.h"
 #include "velox/common/base/BitUtil.h"
 #include "velox/dwio/common/IntCodecCommon.h"
+#include "velox/dwio/common/IntDecoder.h"
 #include "velox/dwio/common/exception/Exception.h"
 
+using namespace facebook::velox;
 using namespace facebook::velox::dwio;
+using namespace facebook::velox::dwio::common;
 namespace bits = facebook::velox::bits;
 
 const size_t kNumElements = 1000000;
@@ -45,6 +48,17 @@
 std::vector<uint64_t> randomInts_u64;
 std::vector<uint64_t> randomInts_u64_result;
 std::vector<char> buffer_u64;
 
+// Array of bit packed representations of randomInts_u32. The array at index i
+// is packed i bits wide and the values come from the low bits of the
+// corresponding element of randomInts_u32.
+std::vector<std::vector<uint64_t>> bitPackedData;
+
+std::vector<int32_t> result32;
+
+std::vector<int32_t> allRowNumbers;
+std::vector<int32_t> oddRowNumbers;
+RowSet allRows;
+RowSet oddRows;
+
 uint64_t readVuLong(const char* buffer, size_t& len) {
   if (LIKELY(len >= folly::kMaxVarintLength64)) {
     const char* p = buffer;
@@ -935,6 +949,144 @@ BENCHMARK_RELATIVE(decodeNew_64) {
       randomInts_u64.size(), buffer_u64.data(), randomInts_u64_result.data());
 }
 
+// Naive unpacking, original version of IntDecoder::decodeBitsLE.
+template <typename T>
+void naiveDecodeBitsLE(
+    const uint64_t* FOLLY_NONNULL bits,
+    int32_t bitOffset,
+    RowSet rows,
+    int32_t rowBias,
+    uint8_t bitWidth,
+    const char* bufferEnd,
+    T* FOLLY_NONNULL result) {
+  uint64_t mask = bits::lowMask(bitWidth);
+  auto numRows = rows.size();
+  if (bitWidth > 56) {
+    for (auto i = 0; i < numRows; ++i) {
+      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      result[i] = bits::detail::loadBits<uint64_t>(bits, bit, bitWidth) & mask;
+    }
+    return;
+  }
+  auto FOLLY_NONNULL lastSafe = bufferEnd - sizeof(uint64_t);
+  int32_t numSafeRows = numRows;
+  bool anyUnsafe = false;
+  if (bufferEnd) {
+    const char* endByte = reinterpret_cast<const char*>(bits) +
+        bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) /
+            8;
+    // redzone is the number of bytes at the end of the accessed range that
+    // could overflow the buffer if accessed 64 bits wide.
+    int64_t redZone =
+        sizeof(uint64_t) - static_cast<int64_t>(bufferEnd - endByte);
+    if (redZone > 0) {
+      anyUnsafe = true;
+      auto numRed = (redZone + 1) * 8 / bitWidth;
+      int32_t lastSafeIndex = rows.back() - numRed;
+      --numSafeRows;
+      for (; numSafeRows >= 1; --numSafeRows) {
+        if (rows[numSafeRows - 1] < lastSafeIndex) {
+          break;
+        }
+      }
+    }
+  }
+  for (auto i = 0; i < numSafeRows; ++i) {
+    auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+    auto byte = bit / 8;
+    auto shift = bit & 7;
+    result[i] = (*reinterpret_cast<const uint64_t*>(
+                     reinterpret_cast<const char*>(bits) + byte) >>
+                 shift) &
+        mask;
+  }
+  if (anyUnsafe) {
+    auto lastSafeWord = bufferEnd - sizeof(uint64_t);
+    assert(lastSafeWord); // lint
+    for (auto i = numSafeRows; i < numRows; ++i) {
+      auto bit = bitOffset + (rows[i] - rowBias) * bitWidth;
+      auto byte = bit / 8;
+      auto shift = bit & 7;
+      result[i] = IntDecoder<false>::safeLoadBits(
+                      reinterpret_cast<const char*>(bits) + byte,
+                      shift,
+                      bitWidth,
+                      lastSafeWord) &
+          mask;
+    }
+  }
+}
+
+template <typename T>
+void unpackNaive(RowSet rows, uint8_t bitWidth, T* result) {
+  auto data = bitPackedData[bitWidth].data();
+  auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8;
+  auto end = reinterpret_cast<const char*>(data) + numBytes;
+  naiveDecodeBitsLE(data, 0, rows, 0, bitWidth, end, result);
+}
+
+template <typename T>
+void unpackFast(RowSet rows, uint8_t bitWidth, T* result) {
+  auto data = bitPackedData[bitWidth].data();
+  auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8;
+  auto end = reinterpret_cast<const char*>(data) + numBytes;
+  IntDecoder<false>::decodeBitsLE(data, 0, rows, 0, bitWidth, end, result);
+}
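+
+// Each BENCHMARK_RELATIVE below is reported by folly as a percentage of the
+// preceding plain BENCHMARK, so all cases of a width are compared against
+// the naive decode of all rows at that width.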
+#define BIT_BM_CASE_32(width)                          \
+  BENCHMARK(unpackNaive##width##_32) {                 \
+    unpackNaive(allRows, width, result32.data());      \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackFast##width##_32) {         \
+    unpackFast(allRows, width, result32.data());       \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackNaive##width##_32_odd) {    \
+    unpackNaive(oddRows, width, result32.data());      \
+  }                                                    \
+                                                       \
+  BENCHMARK_RELATIVE(unpackFast##width##_32_odd) {     \
+    unpackFast(oddRows, width, result32.data());       \
+  }
+
+BIT_BM_CASE_32(7)
+BIT_BM_CASE_32(8)
+BIT_BM_CASE_32(13)
+BIT_BM_CASE_32(16)
+BIT_BM_CASE_32(22)
+BIT_BM_CASE_32(24)
+BIT_BM_CASE_32(31)
+
+void populateBitPacked() {
+  bitPackedData.resize(32);
+  for (auto bitWidth = 2; bitWidth < 32; ++bitWidth) {
+    auto numWords = bits::roundUp(randomInts_u32.size() * bitWidth, 64) / 64;
+    bitPackedData[bitWidth].resize(numWords);
+    auto source = reinterpret_cast<const uint64_t*>(randomInts_u32.data());
+    auto destination =
+        reinterpret_cast<uint64_t*>(bitPackedData[bitWidth].data());
+    for (auto i = 0; i < randomInts_u32.size(); ++i) {
+      bits::copyBits(source, i * 32, destination, i * bitWidth, bitWidth);
+    }
+  }
+  allRowNumbers.resize(randomInts_u32.size());
+  std::iota(allRowNumbers.begin(), allRowNumbers.end(), 0);
+  oddRowNumbers.resize(randomInts_u32.size() / 2);
+  for (auto i = 0; i < oddRowNumbers.size(); i++) {
+    oddRowNumbers[i] = i * 2 + 1;
+  }
+  allRows = RowSet(allRowNumbers);
+  oddRows = RowSet(oddRowNumbers);
+}
+
 int32_t main(int32_t argc, char* argv[]) {
   folly::init(&argc, &argv);
@@ -960,7 +1112,7 @@ int32_t main(int32_t argc, char* argv[]) {
   randomInts_u32_result.resize(randomInts_u32.size());
   len_u32 = pos;
 
-  // Populate uint32 buffer
+  // Populate uint64 buffer
   buffer_u64.resize(kNumElements);
   pos = 0;
   for (int32_t i = 0; i < 100000; i++) {
@@ -968,6 +1120,9 @@ int32_t main(int32_t argc, char* argv[]) {
     randomInts_u64.push_back(randomInt);
     pos = writeVulongToBuffer(randomInt, buffer_u64.data(), pos);
   }
+  populateBitPacked();
+  result32.resize(randomInts_u32.size());
+
   randomInts_u64_result.resize(randomInts_u64.size());
   len_u64 = pos;