diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 828561dd1..470442830 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -170,7 +170,9 @@ inline auto make_pool() inline auto make_arena() { - return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda()); + auto free = rmm::detail::available_device_memory().first; + constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead. + return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda(), free - reserve); } inline auto make_binning() diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b6f15f36f..f1b4e40c4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,26 +78,21 @@ class arena_memory_resource final : public device_memory_resource { * @brief Construct an `arena_memory_resource`. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * - * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. */ explicit arena_memory_resource(Upstream* upstream_mr, - std::size_t initial_size = global_arena::default_initial_size, - std::size_t maximum_size = global_arena::default_maximum_size, - bool dump_log_on_failure = false) - : global_arena_{upstream_mr, initial_size, maximum_size}, - dump_log_on_failure_{dump_log_on_failure} + std::optional<std::size_t> arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output.
+ logger_->set_level(spdlog::level::info); } } @@ -125,17 +120,15 @@ class arena_memory_resource final : public device_memory_resource { bool supports_get_mem_info() const noexcept override { return false; } private: - using global_arena = detail::arena::global_arena<Upstream>; - using arena = detail::arena::arena<Upstream>; - using read_lock = std::shared_lock<std::shared_timed_mutex>; - using write_lock = std::lock_guard<std::shared_timed_mutex>; + using global_arena = rmm::mr::detail::arena::global_arena<Upstream>; + using arena = rmm::mr::detail::arena::arena<Upstream>; /** * @brief Allocates memory of size at least `bytes`. * * The returned pointer has at least 256-byte alignment. * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @throws `rmm::out_of_memory` if no more memory is available for the requested size. * * @param bytes The size in bytes of the allocation. * @param stream The stream to associate this allocation with. @@ -144,22 +137,43 @@ class arena_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (bytes <= 0) { return nullptr; } +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif + auto& arena = get_arena(stream); - bytes = detail::arena::align_up(bytes); - auto& arena = get_arena(stream); - void* pointer = arena.allocate(bytes); + { + std::shared_lock lock(mtx_); + void* pointer = arena.allocate(bytes); + if (pointer != nullptr) { return pointer; } + } - if (pointer == nullptr) { - write_lock lock(mtx_); + { + std::unique_lock lock(mtx_); defragment(); - pointer = arena.allocate(bytes); + void* pointer = arena.allocate(bytes); if (pointer == nullptr) { if (dump_log_on_failure_) { dump_memory_log(bytes); } RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } + return pointer; } + } - return pointer; + /** + * @brief Defragment memory by returning all superblocks to the global arena. + */ + void defragment() + { + RMM_CUDA_TRY(cudaDeviceSynchronize()); + for (auto& thread_arena : thread_arenas_) { + thread_arena.second->clean(); + } + for (auto& stream_arena : stream_arenas_) { + stream_arena.second.clean(); + } } /** @@ -167,29 +181,56 @@ class arena_memory_resource final : public device_memory_resource { * * @param ptr Pointer to be deallocated. * @param bytes The size in bytes of the allocation. This must be equal to the - * value of `bytes` that was passed to the `allocate` call that returned `p`. + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. * @param stream Stream on which to perform deallocation. */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (ptr == nullptr || bytes <= 0) { return; } +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif + auto& arena = get_arena(stream); + + { + std::shared_lock lock(mtx_); + // If the memory being freed does not belong to the arena, the following will return false. + if (arena.deallocate(ptr, bytes, stream)) { return; } + } + + { + // Since we are returning this memory to another stream, we need to make sure the current + // stream is caught up.
+ stream.synchronize_no_throw(); - bytes = detail::arena::align_up(bytes); - get_arena(stream).deallocate(ptr, bytes, stream); + std::unique_lock lock(mtx_); + deallocate_from_other_arena(ptr, bytes, stream); + } } /** - * @brief Defragment memory by returning all free blocks to the global arena. + * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena. + * + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. */ - void defragment() + void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream) { - RMM_CUDA_TRY(cudaDeviceSynchronize()); - for (auto& thread_arena : thread_arenas_) { - thread_arena.second->clean(); - } - for (auto& stream_arena : stream_arenas_) { - stream_arena.second.clean(); + if (use_per_thread_arena(stream)) { + for (auto const& thread_arena : thread_arenas_) { + if (thread_arena.second->deallocate(ptr, bytes)) { return; } + } + } else { + for (auto& stream_arena : stream_arenas_) { + if (stream_arena.second.deallocate(ptr, bytes)) { return; } + } } + + if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); } } /** @@ -213,12 +254,12 @@ { auto const thread_id = std::this_thread::get_id(); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = thread_arenas_.find(thread_id); if (iter != thread_arenas_.end()) { return *iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared<arena>(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena}; @@ -235,12 +276,12 @@ { RMM_LOGGING_ASSERT(!use_per_thread_arena(stream)); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = stream_arenas_.find(stream.value()); if (iter != stream_arenas_.end()) { return iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); stream_arenas_.emplace(stream.value(), global_arena_); return stream_arenas_.at(stream.value()); } } @@ -269,18 +310,6 @@ logger_->info("**************************************************"); logger_->info("Global arena:"); global_arena_.dump_memory_log(logger_); - logger_->info("Per-thread arenas:"); - for (auto const& thread_arena : thread_arenas_) { - logger_->info(" Thread {}:", thread_arena.first); - thread_arena.second->dump_memory_log(logger_); - } - if (!stream_arenas_.empty()) { - logger_->info("Per-stream arenas:"); - for (auto const& stream_arena : stream_arenas_) { - logger_->info(" Stream {}:", static_cast<void*>(stream_arena.first)); - stream_arena.second.dump_memory_log(logger_); - } - } logger_->flush(); }
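For reviewers: the two-phase lookup in `get_arena` above (shared lock on the fast path, exclusive lock only to insert) is easier to see in isolation. A minimal C++17 sketch with hypothetical names — not code from this PR:

```cpp
#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>

// Sketch of the get_arena() pattern: many readers may search the map under a
// shared lock; only a first-time caller takes the exclusive lock to insert.
class arena_registry {
 public:
  int& get_or_create(std::thread::id key)
  {
    {
      std::shared_lock lock(mtx_);  // fast path: concurrent readers
      auto const iter = map_.find(key);
      if (iter != map_.end()) { return *iter->second; }
    }
    std::unique_lock lock(mtx_);  // slow path: exclusive writer
    // try_emplace is a no-op if another thread inserted first.
    auto const [iter, inserted] = map_.try_emplace(key, std::make_shared<int>(0));
    return *iter->second;
  }

 private:
  std::map<std::thread::id, std::shared_ptr<int>> map_;
  mutable std::shared_mutex mtx_;
};
```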
@@ -304,11 +333,13 @@ /// Implementation note: for small sizes, map is more efficient than unordered_map. std::map<cudaStream_t, arena> stream_arenas_; /// If true, dump memory information to log on allocation failure. - bool dump_log_on_failure_; + bool dump_log_on_failure_{}; /// The logger for memory dump. std::shared_ptr<spdlog::logger> logger_{}; - /// Mutex for read and write locks. - mutable std::shared_timed_mutex mtx_; + /// Mutex for read and write locks on arena maps. + mutable std::shared_mutex map_mtx_; + /// Mutex for shared and unique locks on the mr. + mutable std::shared_mutex mtx_; }; } // namespace rmm::mr diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index cda5a73ad..c0e5df377 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -32,216 +33,455 @@ #include #include #include +#include #include -#include namespace rmm::mr::detail::arena { -/// Minimum size of a superblock (256 KiB). -constexpr std::size_t minimum_superblock_size = 1U << 18U; - /** - * @brief Represents a chunk of memory that can be allocated and deallocated. + * @brief Align up to nearest size class. * - * A block bigger than a certain size is called a "superblock". + * @param[in] value value to align. + * @return Return the aligned value. */ -class block { - public: - /** - * @brief Construct a default block. - */ - block() = default; +inline std::size_t align_to_size_class(std::size_t value) noexcept +{ + // See http://jemalloc.net/jemalloc.3.html. + // NOLINTBEGIN(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + static std::array size_classes{ + // clang-format off + // Spacing 256: + 256UL, 512UL, 768UL, 1024UL, 1280UL, 1536UL, 1792UL, 2048UL, + // Spacing 512: + 2560UL, 3072UL, 3584UL, 4096UL, + // Spacing 1 KiB: + 5UL << 10, 6UL << 10, 7UL << 10, 8UL << 10, + // Spacing 2 KiB: + 10UL << 10, 12UL << 10, 14UL << 10, 16UL << 10, + // Spacing 4 KiB: + 20UL << 10, 24UL << 10, 28UL << 10, 32UL << 10, + // Spacing 8 KiB: + 40UL << 10, 48UL << 10, 56UL << 10, 64UL << 10, + // Spacing 16 KiB: + 80UL << 10, 96UL << 10, 112UL << 10, 128UL << 10, + // Spacing 32 KiB: + 160UL << 10, 192UL << 10, 224UL << 10, 256UL << 10, + // Spacing 64 KiB: + 320UL << 10, 384UL << 10, 448UL << 10, 512UL << 10, + // Spacing 128 KiB: + 640UL << 10, 768UL << 10, 896UL << 10, 1UL << 20, + // Spacing 256 KiB: + 1280UL << 10, 1536UL << 10, 1792UL << 10, 2UL << 20, + // Spacing 512 KiB: + 2560UL << 10, 3UL << 20, 3584UL << 10, 4UL << 20, + // Spacing 1 MiB: + 5UL << 20, 6UL << 20, 7UL << 20, 8UL << 20, + // Spacing 2 MiB: + 10UL << 20, 12UL << 20, 14UL << 20, 16UL << 20, + // Spacing 4 MiB: + 20UL << 20, 24UL << 20, 28UL << 20, 32UL << 20, + // Spacing 8 MiB: + 40UL << 20, 48UL << 20, 56UL << 20, 64UL << 20, + // Spacing 16 MiB: + 80UL << 20, 96UL << 20, 112UL << 20, 128UL << 20, + // Spacing 32 MiB: + 160UL << 20, 192UL << 20, 224UL << 20, 256UL << 20, + // Spacing 64 MiB: + 320UL << 20, 384UL << 20, 448UL << 20, 512UL << 20, + // Spacing 128 MiB: + 640UL << 20, 768UL << 20, 896UL << 20, 1UL << 30, + // Spacing 256 MiB: + 1280UL << 20, 1536UL << 20, 1792UL << 20, 2UL << 30, + // Spacing 512 MiB: + 2560UL << 20, 3UL << 30, 3584UL << 20, 4UL << 30, + // Spacing 1 GiB: + 5UL << 30, 6UL << 30, 7UL << 30, 8UL << 30, + // Spacing 2 GiB: + 10UL << 30, 12UL << 30, 14UL << 30, 16UL << 30, + // Spacing 4 GiB: + 20UL << 30, 24UL << 30, 28UL << 30, 32UL << 30, + // Spacing 8 GiB: + 40UL << 30, 48UL << 30, 56UL << 30, 64UL << 30, + // Spacing 16 GiB: + 80UL << 30, 96UL << 30, 112UL << 30, 128UL << 30, + // Spacing 32 GiB:
+ 160UL << 30, 192UL << 30, 224UL << 30, 256UL << 30, + // Catch all: + std::numeric_limits<std::size_t>::max() + // clang-format on + }; + // NOLINTEND(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + + auto* bound = std::lower_bound(size_classes.begin(), size_classes.end(), value); + RMM_LOGGING_ASSERT(bound != size_classes.end()); + return *bound; +} +/** + * @brief Represents a contiguous region of memory. + */ +class byte_span { + public: /** - * @brief Construct a block given a pointer and size. - * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @brief Construct a default span. */ - block(char* pointer, std::size_t size) : pointer_(pointer), size_(size) {} + byte_span() = default; /** - * @brief Construct a block given a void pointer and size. + * @brief Construct a span given a pointer and size. * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @param pointer The address for the beginning of the span. + * @param size The size of the span. */ - block(void* pointer, std::size_t size) : pointer_(static_cast<char*>(pointer)), size_(size) {} + byte_span(void* pointer, std::size_t size) : pointer_{static_cast<char*>(pointer)}, size_{size} + { + RMM_LOGGING_ASSERT(pointer != nullptr); + RMM_LOGGING_ASSERT(size > 0); + } /// Returns the underlying pointer. - [[nodiscard]] void* pointer() const { return pointer_; } + [[nodiscard]] char* pointer() const { return pointer_; } - /// Returns the size of the block. + /// Returns the size of the span. [[nodiscard]] std::size_t size() const { return size_; } - /// Returns true if this block is valid (non-null), false otherwise. - [[nodiscard]] bool is_valid() const { return pointer_ != nullptr; } + /// Returns the end of the span. + [[nodiscard]] char* end() const + { + return pointer_ + size_; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + } + + /// Returns true if this span is valid (non-null), false otherwise. + [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } + + /// Used by std::set to compare spans. + bool operator<(byte_span const& span) const + { + RMM_LOGGING_ASSERT(span.is_valid()); + return pointer_ < span.pointer_; + } - /// Returns true if this block is a superblock, false otherwise. - [[nodiscard]] bool is_superblock() const { return size_ >= minimum_superblock_size; } + private: + char* pointer_{}; ///< Raw memory pointer. + std::size_t size_{}; ///< Size in bytes. +}; + +/// Calculate the total size of a set of spans. +template <typename T> +inline auto total_memory_size(std::set<T> const& spans) +{ + return std::accumulate( + spans.cbegin(), spans.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.size(); + }); +} + +/** + * @brief Represents a chunk of memory that can be allocated and deallocated. + */ +class block final : public byte_span { + public: + using byte_span::byte_span; /** - * @brief Verifies whether this block can be merged to the beginning of block b. + * @brief Is this block large enough to fit `bytes` bytes? * - * @param b The block to check for contiguity. - * @return true Returns true if this block's `pointer` + `size` == `b.ptr`, and `not b.is_head`, - false otherwise. + * @param bytes The size in bytes to check for fit. + * @return true if this block is at least `bytes` bytes.
*/ - [[nodiscard]] bool is_contiguous_before(block const& blk) const + [[nodiscard]] bool fits(std::size_t bytes) const { - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return pointer_ + size_ == blk.pointer_; + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(bytes > 0); + return size() >= bytes; } /** - * @brief Is this block large enough to fit `sz` bytes? + * @brief Verifies whether this block can be merged to the beginning of block blk. * - * @param size The size in bytes to check for fit. - * @return true if this block is at least `sz` bytes. + * @param blk The block to check for contiguity. + * @return true Returns true if this block's `pointer` + `size` == `blk.pointer`. */ - [[nodiscard]] bool fits(std::size_t size) const { return size_ >= size; } + [[nodiscard]] bool is_contiguous_before(block const& blk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return pointer() + size() == blk.pointer(); + } /** * @brief Split this block into two by the given size. * - * @param size The size in bytes of the first block. - * @return std::pair<block, block> A pair of blocks split by sz. + * @param bytes The size in bytes of the first block. + * @return std::pair<block, block> A pair of blocks split by bytes. */ - [[nodiscard]] std::pair<block, block> split(std::size_t size) const + [[nodiscard]] std::pair<block, block> split(std::size_t bytes) const { - RMM_LOGGING_ASSERT(size_ >= size); + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(size() > bytes); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (size_ > size) { return {{pointer_, size}, {pointer_ + size, size_ - size}}; } - return {*this, {}}; + return {{pointer(), bytes}, {pointer() + bytes, size() - bytes}}; } /** * @brief Coalesce two contiguous blocks into one. * - * `this->is_contiguous_before(b)` must be true. + * `this->is_contiguous_before(blk)` must be true. * - * @param b block to merge. + * @param blk block to merge. * @return block The merged block. */ [[nodiscard]] block merge(block const& blk) const { RMM_LOGGING_ASSERT(is_contiguous_before(blk)); - return {pointer_, size_ + blk.size_}; + return {pointer(), size() + blk.size()}; } - - /// Used by std::set to compare blocks. - bool operator<(block const& blk) const { return pointer_ < blk.pointer_; } - - private: - char* pointer_{}; ///< Raw memory pointer. - std::size_t size_{}; ///< Size in bytes. -}; -inline bool block_size_compare(block lhs, block rhs) { return lhs.size() < rhs.size(); } - -/** - * @brief Align up to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value - */ -constexpr std::size_t align_up(std::size_t value) noexcept +/// Comparison function for block sizes. +inline bool block_size_compare(block const& lhs, block const& rhs) { - return rmm::detail::align_up(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + RMM_LOGGING_ASSERT(lhs.is_valid()); + RMM_LOGGING_ASSERT(rhs.is_valid()); + return lhs.size() < rhs.size(); }
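For context, the `block` span arithmetic above round-trips: `split` produces two adjacent blocks and `merge` reassembles them. A minimal sketch against this header — the host-side buffer stands in for an arena allocation, and the function name is illustrative, not from this PR:

```cpp
#include <rmm/mr/device/detail/arena.hpp>
#include <cassert>

void block_split_merge_example()
{
  using rmm::mr::detail::arena::block;
  static char buffer[1024];  // stand-in for arena memory; no device allocation needed
  block const whole{buffer, sizeof(buffer)};
  auto const [head, tail] = whole.split(256);
  assert(head.size() == 256 && tail.size() == 768);
  assert(head.is_contiguous_before(tail));  // head ends exactly where tail begins,
  block const merged = head.merge(tail);    // so the two can be coalesced again
  assert(merged.pointer() == whole.pointer() && merged.size() == whole.size());
}
```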
/** - * @brief Align down to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value + * @brief Represents a large chunk of memory that is exchanged between the global arena and + * per-thread arenas. */ -constexpr std::size_t align_down(std::size_t value) noexcept -{ - return rmm::detail::align_down(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); -} +class superblock final : public byte_span { + public: + /// Minimum size of a superblock (1 MiB). + static constexpr std::size_t minimum_size{1UL << 20}; + /// Maximum size of a superblock (1 TiB), as a sanity check. + static constexpr std::size_t maximum_size{1UL << 40}; -/** - * @brief Get the first free block of at least `size` bytes. - * - * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to - * memory fragmentation, and slightly cheaper to implement. It is also used by some popular - * allocators such as jemalloc. - * - * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM - * Sigplan Notices, 34(3), 26-36. - * - * @param free_blocks The address-ordered set of free blocks. - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes, or an empty block if not found. - */ -inline block first_fit(std::set<block>& free_blocks, std::size_t size) -{ - auto fits = [size](auto const& blk) { return blk.fits(size); }; + /** + * @brief Construct a default superblock. + */ + superblock() = default; - auto const iter = std::find_if(free_blocks.cbegin(), free_blocks.cend(), fits); + /** + * @brief Construct a superblock given a pointer and size. + * + * @param pointer The address for the beginning of the superblock. + * @param size The size of the superblock. + */ + superblock(void* pointer, std::size_t size) : byte_span{pointer, size} + { + RMM_LOGGING_ASSERT(size >= minimum_size); + RMM_LOGGING_ASSERT(size <= maximum_size); + free_blocks_.emplace(pointer, size); + } - if (iter == free_blocks.cend()) { return {}; } - // Remove the block from the free_list. - auto const blk = *iter; - auto const next = free_blocks.erase(iter); + // Disable copy semantics. + superblock(superblock const&) = delete; + superblock& operator=(superblock const&) = delete; + // Allow move semantics. + superblock(superblock&&) noexcept = default; + superblock& operator=(superblock&&) noexcept = default; - if (blk.size() > size) { - // Split the block and put the remainder back. - auto const split = blk.split(size); - free_blocks.insert(next, split.second); - return split.first; + ~superblock() = default; + + /** + * @brief Is this superblock empty? + * + * @return true if this superblock is empty. + */ + [[nodiscard]] bool empty() const + { + RMM_LOGGING_ASSERT(is_valid()); + return free_blocks_.size() == 1 && free_blocks_.cbegin()->size() == size(); } - return blk; -} /** - * @brief Coalesce the given block with other free blocks. - * - * @param free_blocks The address-ordered set of free blocks. - * @param b The block to coalesce. - * @return block The coalesced block. - */ -inline block coalesce_block(std::set<block>& free_blocks, block const& blk) + * @brief Return the number of free blocks. + * + * @return the number of free blocks. + */ + [[nodiscard]] std::size_t free_blocks() const + { + RMM_LOGGING_ASSERT(is_valid()); + return free_blocks_.size(); + } + + /** + * @brief Whether this superblock contains the given block. + * + * @param blk The block to search for. + * @return true if the given block belongs to this superblock. + */ + [[nodiscard]] bool contains(block const& blk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return pointer() <= blk.pointer() && pointer() + size() >= blk.pointer() + blk.size(); + } + + /** + * @brief Can this superblock fit `bytes` bytes? + * + * @param bytes The size in bytes to check for fit. + * @return true if this superblock can fit `bytes` bytes.
+ */ + [[nodiscard]] bool fits(std::size_t bytes) const + { + RMM_LOGGING_ASSERT(is_valid()); + return std::any_of(free_blocks_.cbegin(), free_blocks_.cend(), [bytes](auto const& blk) { + return blk.fits(bytes); + }); + } + + /** + * @brief Verifies whether this superblock can be merged to the beginning of superblock sblk. + * + * @param sblk The superblock to check for contiguity. + * @return true Returns true if both superblocks are empty and this superblock's + * `pointer` + `size` == `sblk.pointer`. + */ + [[nodiscard]] bool is_contiguous_before(superblock const& sblk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(sblk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return empty() && sblk.empty() && pointer() + size() == sblk.pointer(); + } + + /** + * @brief Split this superblock into two by the given size. + * + * @param bytes The size in bytes of the first superblock. + * @return std::pair<superblock, superblock> A pair of superblocks split by bytes. + */ + [[nodiscard]] std::pair<superblock, superblock> split(std::size_t bytes) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(empty() && bytes >= minimum_size && size() >= bytes + minimum_size); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return {superblock{pointer(), bytes}, superblock{pointer() + bytes, size() - bytes}}; + } + + /** + * @brief Coalesce two contiguous superblocks into one. + * + * `this->is_contiguous_before(sblk)` must be true. + * + * @param sblk superblock to merge. + * @return superblock The merged superblock. + */ + [[nodiscard]] superblock merge(superblock const& sblk) const + { + RMM_LOGGING_ASSERT(is_contiguous_before(sblk)); + return {pointer(), size() + sblk.size()}; + } + + /** + * @brief Get the first free block of at least `size` bytes. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. + */ + block first_fit(std::size_t size) + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(size > 0); + + auto fits = [size](auto const& blk) { return blk.fits(size); }; + auto const iter = std::find_if(free_blocks_.cbegin(), free_blocks_.cend(), fits); + if (iter == free_blocks_.cend()) { return {}; } + + // Remove the block from the free list. + auto const blk = *iter; + auto const next = free_blocks_.erase(iter); + + if (blk.size() > size) { + // Split the block and put the remainder back. + auto const split = blk.split(size); + free_blocks_.insert(next, split.second); + return split.first; + } + return blk; + } + + /** + * @brief Coalesce the given block with other free blocks. + * + * @param blk The block to coalesce. + */ + void coalesce(block const& blk) // NOLINT(readability-function-cognitive-complexity) + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + RMM_LOGGING_ASSERT(contains(blk)); + + // Find the right place (in ascending address order) to insert the block. + auto const next = free_blocks_.lower_bound(blk); + auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks.
+ bool const merge_prev = previous != free_blocks_.cend() && previous->is_contiguous_before(blk); + bool const merge_next = next != free_blocks_.cend() && blk.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto const merged = previous->merge(blk).merge(*next); + free_blocks_.erase(previous); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else if (merge_prev) { + auto const merged = previous->merge(blk); + auto const iter = free_blocks_.erase(previous); + free_blocks_.insert(iter, merged); + } else if (merge_next) { + auto const merged = blk.merge(*next); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else { + free_blocks_.insert(next, blk); + } + } + + /** + * @brief Find the total free block size. + * @return the total free block size. + */ + [[nodiscard]] std::size_t total_free_size() const { return total_memory_size(free_blocks_); } + + /** + * @brief Find the max free block size. + * @return the max free block size. + */ + [[nodiscard]] std::size_t max_free_size() const + { + if (free_blocks_.empty()) { return 0; } + return std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare)->size(); + } + + private: + /// Address-ordered set of free blocks. + std::set<block> free_blocks_{}; +}; + +/// Calculate the total free size of a set of superblocks. +inline auto total_free_size(std::set<superblock> const& superblocks) { - if (!blk.is_valid()) { return blk; } - - // Find the right place (in ascending address order) to insert the block. - auto const next = free_blocks.lower_bound(blk); - auto const previous = next == free_blocks.cbegin() ? next : std::prev(next); - - // Coalesce with neighboring blocks. - bool const merge_prev = previous->is_contiguous_before(blk); - bool const merge_next = next != free_blocks.cend() && blk.is_contiguous_before(*next); - - block merged{}; - if (merge_prev && merge_next) { - merged = previous->merge(blk).merge(*next); - free_blocks.erase(previous); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else if (merge_prev) { - merged = previous->merge(blk); - auto const iter = free_blocks.erase(previous); - free_blocks.insert(iter, merged); - } else if (merge_next) { - merged = blk.merge(*next); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else { - free_blocks.emplace(blk); - merged = blk; - } - return merged; + return std::accumulate( + superblocks.cbegin(), superblocks.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.total_free_size(); + }); } -template <typename T> -inline auto total_block_size(T const& blocks) +/// Find the max free size from a set of superblocks. +inline auto max_free_size(std::set<superblock> const& superblocks) { - return std::accumulate(blocks.cbegin(), blocks.cend(), std::size_t{}, [](auto lhs, auto rhs) { - return lhs + rhs.size(); - }); -} + std::size_t size{}; + for (auto const& sblk : superblocks) { + size = std::max(size, sblk.max_free_size()); + } + return size; +};
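The two helpers above feed the fragmentation figure that `dump_memory_log` reports further down. Factored out as a standalone formula — a sketch; the zero-free guard is an assumption added here, not code from this PR:

```cpp
#include <cstddef>

// Fragmentation as reported by dump_memory_log(): 0% when a single free block
// holds all free memory, approaching 100% as free memory scatters across blocks.
double fragmentation_percent(std::size_t total_free, std::size_t max_free)
{
  if (total_free == 0) { return 0.0; }
  return (1.0 - static_cast<double>(max_free) / static_cast<double>(total_free)) * 100.0;
}
```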
/** * @brief The global arena for allocating memory from the upstream memory resource. @@ -254,51 +494,24 @@ inline auto total_block_size(T const& blocks) template <typename Upstream> class global_arena final { public: - /// The default initial size for the global arena. - static constexpr std::size_t default_initial_size = std::numeric_limits<std::size_t>::max(); - /// The default maximum size for the global arena. - static constexpr std::size_t default_maximum_size = std::numeric_limits<std::size_t>::max(); - /// Reserved memory that should not be allocated (64 MiB). - static constexpr std::size_t reserved_size = 1U << 26U; - /** * @brief Construct a global arena. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. */ - global_arena(Upstream* upstream_mr, std::size_t initial_size, std::size_t maximum_size) - : upstream_mr_{upstream_mr}, maximum_size_{maximum_size} + global_arena(Upstream* upstream_mr, std::optional<std::size_t> arena_size) + : upstream_mr_{upstream_mr} { RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); - RMM_EXPECTS(initial_size == default_initial_size || initial_size == align_up(initial_size), - "Error, Initial arena size required to be a multiple of 256 bytes"); - RMM_EXPECTS(maximum_size_ == default_maximum_size || maximum_size_ == align_up(maximum_size_), - "Error, Maximum arena size required to be a multiple of 256 bytes"); - - if (initial_size == default_initial_size || maximum_size == default_maximum_size) { - std::size_t free{}; - std::size_t total{}; - RMM_CUDA_TRY(cudaMemGetInfo(&free, &total)); - if (initial_size == default_initial_size) { - initial_size = align_up(std::min(free, total / 2)); - } - if (maximum_size_ == default_maximum_size) { - maximum_size_ = align_down(free) - reserved_size; - } - } - RMM_EXPECTS(initial_size <= maximum_size_, "Initial arena size exceeds the maximum pool size!"); - - free_blocks_.emplace(expand_arena(initial_size)); + auto const size = rmm::detail::align_down(arena_size.value_or(default_size()), + rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + RMM_EXPECTS(size >= superblock::minimum_size, + "Arena size smaller than minimum superblock size."); + initialize(size); } // Disable copy (and move) semantics. @@ -313,48 +526,120 @@ */ ~global_arena() { - lock_guard lock(mtx_); - for (auto const& blk : upstream_blocks_) { - upstream_mr_->deallocate(blk.pointer(), blk.size()); - } + std::lock_guard lock(mtx_); + upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + } + + /** + * @brief Should allocation of `size` bytes be handled by the global arena directly? + * + * @param size The size in bytes of the allocation. + * @return bool True if the allocation should be handled by the global arena. + */ + bool handles(std::size_t size) const { return size > superblock::minimum_size; } + + /** + * @brief Acquire a superblock that can fit a block of the given size. + * + * @param size The size in bytes of the allocation. + * @return superblock The acquired superblock. + */ + superblock acquire(std::size_t size) + { + // Superblocks should only be acquired if the size is not directly handled by the global arena.
+ RMM_LOGGING_ASSERT(!handles(size)); + std::lock_guard lock(mtx_); + return first_fit(size); + } + + /** + * @brief Release a superblock. + * + * @param sblk Superblock to be released. + */ + void release(superblock&& sblk) + { + RMM_LOGGING_ASSERT(sblk.is_valid()); + std::lock_guard lock(mtx_); + coalesce(std::move(sblk)); } /** - * @brief Allocates memory of size at least `bytes`. + * @brief Release a set of superblocks from a dying arena. * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @param superblocks The set of superblocks. + */ + void release(std::set<superblock>& superblocks) + { + std::lock_guard lock(mtx_); + while (!superblocks.empty()) { + auto sblk = std::move(superblocks.extract(superblocks.cbegin()).value()); + RMM_LOGGING_ASSERT(sblk.is_valid()); + coalesce(std::move(sblk)); + } + } + + /** + * @brief Allocate a large block directly. * - * @param bytes The size in bytes of the allocation. + * @param size The size in bytes of the allocation. * @return void* Pointer to the newly allocated memory. */ - block allocate(std::size_t bytes) + void* allocate(std::size_t size) { - lock_guard lock(mtx_); - return get_block(bytes); + RMM_LOGGING_ASSERT(handles(size)); + std::lock_guard lock(mtx_); + auto sblk = first_fit(size); + if (sblk.is_valid()) { + auto blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk.pointer(); + } + return nullptr; } /** - * @brief Deallocate memory pointed to by `blk`. + * @brief Deallocate memory pointed to by `ptr`. * - * @param blk Block to be deallocated. + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. */ - void deallocate(block const& blk) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - lock_guard lock(mtx_); - coalesce_block(free_blocks_, blk); + RMM_LOGGING_ASSERT(handles(size)); + stream.synchronize_no_throw(); + return deallocate(ptr, size); } /** - * @brief Deallocate a set of free blocks from a dying arena. + * @brief Deallocate memory pointed to by `ptr`. * - * @param free_blocks The set of free blocks. + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @return bool true if the allocation is found, false otherwise.
*/ - void deallocate(std::set<block> const& free_blocks) + bool deallocate(void* ptr, std::size_t bytes) { - lock_guard lock(mtx_); - for (auto const& blk : free_blocks) { - coalesce_block(free_blocks_, blk); + std::lock_guard lock(mtx_); + + block const blk{ptr, bytes}; + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); + if (iter == superblocks_.cend()) { return false; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + if (sblk.empty()) { + coalesce(std::move(sblk)); + } else { + superblocks_.insert(std::move(sblk)); } + return true; } /** @@ -364,86 +649,136 @@ */ void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const { - lock_guard lock(mtx_); - - logger->info(" Maximum size: {}", rmm::detail::bytes{maximum_size_}); - logger->info(" Current size: {}", rmm::detail::bytes{current_size_}); - - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); + std::lock_guard lock(mtx_); + + logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); + logger->info(" # superblocks: {}", superblocks_.size()); + if (!superblocks_.empty()) { + logger->debug(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + auto const total_free = total_free_size(superblocks_); + auto const max_free = max_free_size(superblocks_); + auto const fragmentation = (1 - max_free / static_cast<double>(total_free)) * 100; + logger->info(" Total free memory: {}", rmm::detail::bytes{total_free}); + logger->info(" Largest block of free memory: {}", rmm::detail::bytes{max_free}); + logger->info(" Fragmentation: {:.2f}%", fragmentation); + + auto index = 0; + char* prev_end{}; + for (auto const& sblk : superblocks_) { + if (prev_end == nullptr) { prev_end = sblk.pointer(); } + logger->debug( + " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " + "gap={}", + index, + fmt::ptr(sblk.pointer()), + fmt::ptr(sblk.end()), + rmm::detail::bytes{sblk.size()}, + sblk.empty(), + sblk.free_blocks(), + rmm::detail::bytes{sblk.max_free_size()}, + rmm::detail::bytes{static_cast<std::size_t>(sblk.pointer() - prev_end)}); + prev_end = sblk.end(); + index++; + } } - - logger->info(" # upstream blocks={}", upstream_blocks_.size()); - logger->info(" Total size of upstream blocks: {}", - rmm::detail::bytes{total_block_size(upstream_blocks_)}); } private: - using lock_guard = std::lock_guard<std::mutex>; + /** + * @brief Default size of the global arena if unspecified. + * @return the default global arena size. + */ + constexpr std::size_t default_size() const + { + auto const [free, total] = rmm::detail::available_device_memory(); + return free / 2; + } /** - * @brief Get an available memory block of at least `size` bytes. + * @brief Allocate space from upstream to initialize the arena. * - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes. + * @param size The size to allocate. */ - block get_block(std::size_t size) + void initialize(std::size_t size) { - // Find the first-fit free block.
- auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } - - // No existing larger blocks available, so grow the arena. - auto const upstream_block = expand_arena(size_to_grow(size)); - coalesce_block(free_blocks_, upstream_block); - return first_fit(free_blocks_, size); + upstream_block_ = {upstream_mr_->allocate(size), size}; + superblocks_.emplace(upstream_block_.pointer(), size); } /** - * @brief Get the size to grow the global arena given the requested `size` bytes. + * @brief Get the first superblock that can fit a block of at least `size` bytes. + * + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc. * - * This simply grows the global arena to the maximum size. + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. * - * @param size The number of bytes required. - * @return size The size for the arena to grow, or 0 if no more memory. + * @param size The number of bytes to allocate. + * @return superblock A superblock that can fit at least `size` bytes, or empty if not found. */ - constexpr std::size_t size_to_grow(std::size_t size) const + superblock first_fit(std::size_t size) { - if (current_size_ + size > maximum_size_) { return 0; } - return maximum_size_ - current_size_; + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [=](auto const& sblk) { return sblk.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const min_size = std::max(superblock::minimum_size, size); + if (sblk.empty() && sblk.size() >= min_size + superblock::minimum_size) { + // Split the superblock and put the remainder back. + auto [head, tail] = sblk.split(min_size); + superblocks_.insert(std::move(tail)); + return std::move(head); + } + return sblk; } /** - * @brief Allocate space from upstream to supply the arena and return a sufficiently sized block. + * @brief Coalesce the given superblock with other empty superblocks. * - * @param size The minimum size to allocate. - * @return block A block of at least `size` bytes. + * @param sblk The superblock to coalesce. */ - block expand_arena(std::size_t size) + void coalesce(superblock&& sblk) { - if (size > 0) { - upstream_blocks_.push_back({upstream_mr_->allocate(size), size}); - current_size_ += size; - return upstream_blocks_.back(); + RMM_LOGGING_ASSERT(sblk.is_valid()); + + // Find the right place (in ascending address order) to insert the block. + auto const next = superblocks_.lower_bound(sblk); + auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks.
+ bool const merge_prev = previous != superblocks_.cend() && previous->is_contiguous_before(sblk); + bool const merge_next = next != superblocks_.cend() && sblk.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = prev_sb.merge(sblk).merge(next_sb); + superblocks_.insert(std::move(merged)); + } else if (merge_prev) { + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto merged = prev_sb.merge(sblk); + superblocks_.insert(std::move(merged)); + } else if (merge_next) { + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = sblk.merge(next_sb); + superblocks_.insert(std::move(merged)); + } else { + superblocks_.insert(std::move(sblk)); } - return {}; } /// The upstream resource to allocate memory from. Upstream* upstream_mr_; - /// The maximum size the global arena can grow to. - std::size_t maximum_size_; - /// The current size of the global arena. - std::size_t current_size_{}; - /// Address-ordered set of free blocks. - std::set<block> free_blocks_; - /// Blocks allocated from upstream so that they can be quickly freed. - std::vector<block> upstream_blocks_; + /// Block allocated from upstream so that it can be quickly freed. + block upstream_block_; + /// Address-ordered set of superblocks. + std::set<superblock> superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -452,7 +787,7 @@ * @brief An arena for allocating memory for a thread. * * An arena is a per-thread or per-non-default-stream memory pool. It allocates - * superblocks from the global arena, and return them when the superblocks become empty. + * superblocks from the global arena, and returns them when the superblocks become empty. * * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. @@ -467,78 +802,81 @@ class arena { */ explicit arena(global_arena<Upstream>& global_arena) : global_arena_{global_arena} {} - ~arena() = default; - // Disable copy (and move) semantics. arena(arena const&) = delete; arena& operator=(arena const&) = delete; arena(arena&&) noexcept = delete; arena& operator=(arena&&) noexcept = delete; + ~arena() = default; + /** - * @brief Allocates memory of size at least `bytes`. - * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @brief Allocates memory of size at least `size` bytes. * - * @param bytes The size in bytes of the allocation. + * @param size The size in bytes of the allocation. * @return void* Pointer to the newly allocated memory. */ - void* allocate(std::size_t bytes) + void* allocate(std::size_t size) { - lock_guard lock(mtx_); - auto const blk = get_block(bytes); - return blk.pointer(); + if (global_arena_.handles(size)) { return global_arena_.allocate(size); } + std::lock_guard lock(mtx_); + return get_block(size).pointer(); } /** * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. * * @param ptr Pointer to be deallocated. - * @param bytes The size in bytes of the allocation. This must be equal to the value of `bytes` + * @param size The size in bytes of the allocation. This must be equal to the value of `size` * that was passed to the `allocate` call that returned `ptr`. * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise.
*/ - void deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - lock_guard lock(mtx_); - block const blk{ptr, bytes}; - auto const merged = coalesce_block(free_blocks_, blk); - shrink_arena(merged, stream); + if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + return deallocate(ptr, size); } /** - * @brief Clean the arena and deallocate free blocks from the global arena. + * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. + * + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `ptr`. + * @return bool true if the allocation is found, false otherwise. + */ + bool deallocate(void* ptr, std::size_t size) + { + std::lock_guard lock(mtx_); + return deallocate_from_superblock({ptr, size}); + } + + /** + * @brief Clean the arena and release all superblocks to the global arena. */ void clean() { - lock_guard lock(mtx_); - global_arena_.deallocate(free_blocks_); - free_blocks_.clear(); + std::lock_guard lock(mtx_); + global_arena_.release(superblocks_); + superblocks_.clear(); } /** - * Dump memory to log. - * - * @param logger the spdlog logger to use + * @brief Defragment the arena and release empty superblocks to the global arena. */ - void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const + void defragment() { - lock_guard lock(mtx_); - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); + std::lock_guard lock(mtx_); + while (true) { + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [](auto const& sblk) { return sblk.empty(); }); + if (iter == superblocks_.cend()) { return; } + global_arena_.release(std::move(superblocks_.extract(iter).value())); } } private: - using lock_guard = std::lock_guard<std::mutex>; - /// Maximum number of free blocks to keep. - static constexpr int max_free_blocks = 16; - /** * @brief Get an available memory block of at least `size` bytes. * * @param size The number of bytes to allocate. * @return block A block of memory of at least `size` bytes. */ block get_block(std::size_t size) { - if (size < minimum_superblock_size) { - // Find the first-fit free block. - auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } - } + // Find the first-fit free block. + auto const blk = first_fit(size); + if (blk.is_valid()) { return blk; } // No existing larger blocks available, so grow the arena and obtain a superblock. - auto const superblock = expand_arena(size); - if (superblock.is_valid()) { - coalesce_block(free_blocks_, superblock); - return first_fit(free_blocks_, size); - } - return superblock; + return expand_arena(size); } /** - * @brief Allocate space from upstream to supply the arena and return a superblock. + * @brief Get the first free block of at least `size` bytes. * - * @return A superblock. + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc.
* + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. */ - block expand_arena(std::size_t size) + block first_fit(std::size_t size) + { + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [size](auto const& sblk) { return sblk.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; + } + + /** + * @brief Deallocate a block from the superblock it belongs to. + * + * @param blk The block to deallocate. + * @return true if the block is found. + */ + bool deallocate_from_superblock(block const& blk) { - auto const superblock_size = std::max(size, minimum_superblock_size); - return global_arena_.allocate(superblock_size); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); + if (iter == superblocks_.cend()) { return false; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + superblocks_.insert(std::move(sblk)); + return true; } /** - * @brief Shrink this arena by returning free superblocks to upstream. + * @brief Allocate space from upstream to supply the arena and return a block. * - * @param blk The block that can be used to shrink the arena. - * @param stream Stream on which to perform shrinking. + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes. */ - void shrink_arena(block const& blk, cuda_stream_view stream) + block expand_arena(std::size_t size) { - if (blk.is_superblock() || free_blocks_.size() > max_free_blocks) { - stream.synchronize_no_throw(); - global_arena_.deallocate(blk); - free_blocks_.erase(blk); + auto sblk = global_arena_.acquire(size); + if (sblk.is_valid()) { + RMM_LOGGING_ASSERT(sblk.size() >= superblock::minimum_size); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; } + return {}; } /// The global arena to allocate superblocks from. global_arena<Upstream>& global_arena_; - /// Free blocks. - std::set<block> free_blocks_; + /// Acquired superblocks. + std::set<superblock> superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; };
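Before the test changes, a minimal usage sketch of the revised public API — the old `initial_size`/`maximum_size` pair collapses into one optional arena size (illustrative only, not part of this PR):

```cpp
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  // Default: the global arena claims half of the currently available device memory.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr_default{&cuda_mr};
  // Explicit 1 GiB arena, dumping the memory log on allocation failure.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr_sized{&cuda_mr, 1UL << 30, true};
  void* ptr = mr_sized.allocate(1UL << 20);
  mr_sized.deallocate(ptr, 1UL << 20);
  return 0;
}
```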
diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index d9499802d..b86e2457c 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,51 +15,527 @@ */ #include "../../byte_literals.hpp" -#include "rmm/cuda_stream_view.hpp" - -#include - +#include +#include +#include #include #include -#include #include +#include #include +#include #include #include #include namespace rmm::test { - namespace { -using cuda_mr = rmm::mr::cuda_memory_resource; -using arena_mr = rmm::mr::arena_memory_resource<cuda_mr>; -TEST(ArenaTest, ThrowOnNullUpstream) +class mock_memory_resource { + public: + MOCK_METHOD(void*, allocate, (std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t)); +}; + +using rmm::mr::detail::arena::block; +using rmm::mr::detail::arena::byte_span; +using rmm::mr::detail::arena::superblock; +using global_arena = rmm::mr::detail::arena::global_arena<mock_memory_resource>; +using arena = rmm::mr::detail::arena::arena<mock_memory_resource>; +using arena_mr = rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource>; +using ::testing::Return; + +// NOLINTBEGIN(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +auto const fake_address = reinterpret_cast<void*>(1_KiB); +auto const fake_address2 = reinterpret_cast<void*>(2_KiB); +auto const fake_address3 = reinterpret_cast<void*>(superblock::minimum_size); +auto const fake_address4 = reinterpret_cast<void*>(superblock::minimum_size * 2); +// NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) + +struct ArenaTest : public ::testing::Test { + void SetUp() override + { + EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); + global = std::make_unique<global_arena>(&mock_mr, arena_size); + per_thread = std::make_unique<arena>(*global); + } + + std::size_t arena_size{superblock::minimum_size * 4}; + mock_memory_resource mock_mr{}; + std::unique_ptr<global_arena> global{}; + std::unique_ptr<arena> per_thread{}; +}; + +/** + * Test align_to_size_class. + */ +TEST_F(ArenaTest, AlignToSizeClass) // NOLINT +{ + using rmm::mr::detail::arena::align_to_size_class; + EXPECT_EQ(align_to_size_class(8), 256); + EXPECT_EQ(align_to_size_class(256), 256); + EXPECT_EQ(align_to_size_class(264), 512); + EXPECT_EQ(align_to_size_class(512), 512); + EXPECT_EQ(align_to_size_class(17_KiB), 20_KiB); + EXPECT_EQ(align_to_size_class(13_MiB), 14_MiB); + EXPECT_EQ(align_to_size_class(2500_MiB), 2560_MiB); + EXPECT_EQ(align_to_size_class(128_GiB), 128_GiB); + EXPECT_EQ(align_to_size_class(1_PiB), std::numeric_limits<std::size_t>::max()); +} + +/** + * Test byte_span. + */ + +TEST_F(ArenaTest, ByteSpan) // NOLINT +{ + byte_span const span{}; + EXPECT_FALSE(span.is_valid()); + byte_span const span2{fake_address, 256}; + EXPECT_TRUE(span2.is_valid()); +} + +/** + * Test block.
+ */ + +TEST_F(ArenaTest, BlockFits) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + EXPECT_TRUE(blk.fits(1_KiB)); + EXPECT_FALSE(blk.fits(1_KiB + 1)); +} + +TEST_F(ArenaTest, BlockIsContiguousBefore) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 256}; + EXPECT_TRUE(blk.is_contiguous_before(blk2)); + block const blk3{fake_address, 512}; + block const blk4{fake_address2, 1_KiB}; + EXPECT_FALSE(blk3.is_contiguous_before(blk4)); +} + +TEST_F(ArenaTest, BlockSplit) // NOLINT +{ + block const blk{fake_address, 2_KiB}; + auto const [head, tail] = blk.split(1_KiB); + EXPECT_EQ(head.pointer(), fake_address); + EXPECT_EQ(head.size(), 1_KiB); + EXPECT_EQ(tail.pointer(), fake_address2); + EXPECT_EQ(tail.size(), 1_KiB); +} + +TEST_F(ArenaTest, BlockMerge) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 1_KiB}; + auto const merged = blk.merge(blk2); + EXPECT_EQ(merged.pointer(), fake_address); + EXPECT_EQ(merged.size(), 2_KiB); +} + +/** + * Test superblock. + */ + +TEST_F(ArenaTest, SuperblockEmpty) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.empty()); + sblk.first_fit(256); + EXPECT_FALSE(sblk.empty()); +} + +TEST_F(ArenaTest, SuperblockContains) // NOLINT +{ + superblock const sblk{fake_address3, superblock::minimum_size}; + block const blk{fake_address, 2_KiB}; + EXPECT_FALSE(sblk.contains(blk)); + block const blk2{fake_address3, 1_KiB}; + EXPECT_TRUE(sblk.contains(blk2)); + block const blk3{fake_address3, superblock::minimum_size + 1}; + EXPECT_FALSE(sblk.contains(blk3)); + block const blk4{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.contains(blk4)); + block const blk5{fake_address4, 256}; + EXPECT_FALSE(sblk.contains(blk5)); +} + +TEST_F(ArenaTest, SuperblockFits) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.fits(superblock::minimum_size)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size + 1)); + + auto const blk = sblk.first_fit(superblock::minimum_size / 4); + sblk.first_fit(superblock::minimum_size / 4); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.fits(superblock::minimum_size / 2)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size / 2 + 1)); +} + +TEST_F(ArenaTest, SuperblockIsContiguousBefore) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); + + auto const blk = sblk.first_fit(256); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); + + auto const blk2 = sb2.first_fit(1_KiB); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sb2.coalesce(blk2); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); +} + +TEST_F(ArenaTest, SuperblockSplit) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size * 2}; + auto const [head, tail] = sblk.split(superblock::minimum_size); + EXPECT_EQ(head.pointer(), fake_address3); + EXPECT_EQ(head.size(), superblock::minimum_size); + EXPECT_TRUE(head.empty()); + EXPECT_EQ(tail.pointer(), fake_address4); + EXPECT_EQ(tail.size(), superblock::minimum_size); + EXPECT_TRUE(tail.empty()); +} + +TEST_F(ArenaTest, SuperblockMerge) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; + auto const merged = sblk.merge(sb2); + EXPECT_EQ(merged.pointer(), fake_address3); + EXPECT_EQ(merged.size(),
+
+/**
+ * Test superblock.
+ */
+
+TEST_F(ArenaTest, SuperblockEmpty)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.empty());
+  sblk.first_fit(256);
+  EXPECT_FALSE(sblk.empty());
+}
+
+TEST_F(ArenaTest, SuperblockContains)  // NOLINT
+{
+  superblock const sblk{fake_address3, superblock::minimum_size};
+  block const blk{fake_address, 2_KiB};
+  EXPECT_FALSE(sblk.contains(blk));
+  block const blk2{fake_address3, 1_KiB};
+  EXPECT_TRUE(sblk.contains(blk2));
+  block const blk3{fake_address3, superblock::minimum_size + 1};
+  EXPECT_FALSE(sblk.contains(blk3));
+  block const blk4{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.contains(blk4));
+  block const blk5{fake_address4, 256};
+  EXPECT_FALSE(sblk.contains(blk5));
+}
+
+TEST_F(ArenaTest, SuperblockFits)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.fits(superblock::minimum_size));
+  EXPECT_FALSE(sblk.fits(superblock::minimum_size + 1));
+
+  auto const blk = sblk.first_fit(superblock::minimum_size / 4);
+  sblk.first_fit(superblock::minimum_size / 4);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.fits(superblock::minimum_size / 2));
+  EXPECT_FALSE(sblk.fits(superblock::minimum_size / 2 + 1));
+}
+
+TEST_F(ArenaTest, SuperblockIsContiguousBefore)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  superblock sb2{fake_address4, superblock::minimum_size};
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+
+  auto const blk = sblk.first_fit(256);
+  EXPECT_FALSE(sblk.is_contiguous_before(sb2));
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+
+  auto const blk2 = sb2.first_fit(1_KiB);
+  EXPECT_FALSE(sblk.is_contiguous_before(sb2));
+  sb2.coalesce(blk2);
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+}
+
+TEST_F(ArenaTest, SuperblockSplit)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size * 2};
+  auto const [head, tail] = sblk.split(superblock::minimum_size);
+  EXPECT_EQ(head.pointer(), fake_address3);
+  EXPECT_EQ(head.size(), superblock::minimum_size);
+  EXPECT_TRUE(head.empty());
+  EXPECT_EQ(tail.pointer(), fake_address4);
+  EXPECT_EQ(tail.size(), superblock::minimum_size);
+  EXPECT_TRUE(tail.empty());
+}
+
+TEST_F(ArenaTest, SuperblockMerge)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  superblock sb2{fake_address4, superblock::minimum_size};
+  auto const merged = sblk.merge(sb2);
+  EXPECT_EQ(merged.pointer(), fake_address3);
+  EXPECT_EQ(merged.size(), superblock::minimum_size * 2);
+  EXPECT_TRUE(merged.empty());
+}
+
+TEST_F(ArenaTest, SuperblockFirstFit)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(1_KiB);
+  EXPECT_EQ(blk.pointer(), fake_address3);
+  EXPECT_EQ(blk.size(), 1_KiB);
+  auto const blk2 = sblk.first_fit(2_KiB);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+  EXPECT_EQ(blk2.pointer(), static_cast<char*>(fake_address3) + 1_KiB);
+  EXPECT_EQ(blk2.size(), 2_KiB);
+  sblk.coalesce(blk);
+  auto const blk3 = sblk.first_fit(512);
+  EXPECT_EQ(blk3.pointer(), fake_address3);
+  EXPECT_EQ(blk3.size(), 512);
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceAfterFull)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 2);
+  sblk.first_fit(superblock::minimum_size / 2);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size / 2).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergeNext)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 2);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergePrevious)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk  = sblk.first_fit(1_KiB);
+  auto const blk2 = sblk.first_fit(1_KiB);
+  sblk.first_fit(1_KiB);
+  sblk.coalesce(blk);
+  sblk.coalesce(blk2);
+  auto const blk3 = sblk.first_fit(2_KiB);
+  EXPECT_EQ(blk3.pointer(), fake_address3);
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergePreviousAndNext)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk  = sblk.first_fit(1_KiB);
+  auto const blk2 = sblk.first_fit(1_KiB);
+  sblk.coalesce(blk);
+  sblk.coalesce(blk2);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockMaxFreeSize)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 4);
+  sblk.first_fit(superblock::minimum_size / 4);
+  sblk.coalesce(blk);
+  EXPECT_EQ(sblk.max_free_size(), superblock::minimum_size / 2);
+}
+
+TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  sblk.first_fit(superblock::minimum_size);
+  EXPECT_EQ(sblk.max_free_size(), 0);
+}
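The coalesce tests are the core of the design: when a block is freed, it merges with a free neighbor on either side, so the superblock can regain large contiguous spans instead of fragmenting. A compact sketch of first-fit plus neighbor coalescing over an address-ordered free set; offsets stand in for device pointers, and none of this is RMM's actual code:

```cpp
#include <cstddef>
#include <iterator>
#include <limits>
#include <set>

struct free_range {
  std::size_t offset;
  std::size_t size;
  bool operator<(free_range const& other) const { return offset < other.offset; }
};

using free_set = std::set<free_range>;

// Return the offset of the first range that fits, carving off any remainder.
std::size_t first_fit(free_set& frees, std::size_t bytes)
{
  for (auto it = frees.begin(); it != frees.end(); ++it) {
    if (it->size >= bytes) {
      free_range const hit = *it;
      frees.erase(it);
      if (hit.size > bytes) { frees.insert({hit.offset + bytes, hit.size - bytes}); }
      return hit.offset;
    }
  }
  return std::numeric_limits<std::size_t>::max();
}

// Return a range to the free set, merging with adjacent neighbors.
void coalesce(free_set& frees, free_range range)
{
  auto next = frees.lower_bound(range);
  if (next != frees.begin()) {
    auto prev = std::prev(next);
    if (prev->offset + prev->size == range.offset) {
      range = {prev->offset, prev->size + range.size};
      frees.erase(prev);
    }
  }
  if (next != frees.end() && range.offset + range.size == next->offset) {
    range = {range.offset, range.size + next->size};
    frees.erase(next);
  }
  frees.insert(range);
}
```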
+
+/**
+ * Test global_arena.
+ */
+
+TEST_F(ArenaTest, GlobalArenaNullUpstream)  // NOLINT
+{
+  auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; };
+  EXPECT_THROW(construct_nullptr(), rmm::logic_error);  // NOLINT(cppcoreguidelines-avoid-goto)
+}
+
+TEST_F(ArenaTest, GlobalArenaAcquire)  // NOLINT
+{
+  auto const sblk = global->acquire(256);
+  EXPECT_EQ(sblk.pointer(), fake_address3);
+  EXPECT_EQ(sblk.size(), superblock::minimum_size);
+  EXPECT_TRUE(sblk.empty());
+
+  auto const sb2 = global->acquire(1_KiB);
+  EXPECT_EQ(sb2.pointer(), fake_address4);
+  EXPECT_EQ(sb2.size(), superblock::minimum_size);
+  EXPECT_TRUE(sb2.empty());
+
+  global->acquire(512);
+  global->acquire(512);
+  EXPECT_FALSE(global->acquire(512).is_valid());
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergeNext)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  global->release(std::move(sblk));
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergePrevious)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  auto sb2  = global->acquire(1_KiB);
+  global->acquire(512);
+  global->release(std::move(sblk));
+  global->release(std::move(sb2));
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergePreviousAndNext)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  auto sb2  = global->acquire(1_KiB);
+  auto sb3  = global->acquire(512);
+  global->release(std::move(sblk));
+  global->release(std::move(sb3));
+  global->release(std::move(sb2));
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMultiple)  // NOLINT
+{
+  std::set<superblock> superblocks{};
+  auto sblk = global->acquire(256);
+  superblocks.insert(std::move(sblk));
+  auto sb2 = global->acquire(1_KiB);
+  superblocks.insert(std::move(sb2));
+  auto sb3 = global->acquire(512);
+  superblocks.insert(std::move(sb3));
+  global->release(superblocks);
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaAllocate)  // NOLINT
+{
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaAllocateExtraLarge)  // NOLINT
+{
+  EXPECT_EQ(global->allocate(1_PiB), nullptr);
+  EXPECT_EQ(global->allocate(1_PiB), nullptr);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocate)  // NOLINT
+{
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+  global->deallocate(ptr, superblock::minimum_size * 2, {});
+  ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp)  // NOLINT
+{
+  auto* ptr  = global->allocate(superblock::minimum_size + 256);
+  auto* ptr2 = global->allocate(superblock::minimum_size + 512);
+  global->deallocate(ptr, superblock::minimum_size + 256, {});
+  global->deallocate(ptr2, superblock::minimum_size + 512, {});
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena)  // NOLINT
+{
+  auto sblk       = global->acquire(512);
+  auto const blk  = sblk.first_fit(512);
+  auto const blk2 = sblk.first_fit(1024);
+  global->release(std::move(sblk));
+  global->deallocate(blk.pointer(), blk.size());
+  global->deallocate(blk2.pointer(), blk2.size());
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
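The global arena tests above follow one upstream allocation (`arena_size`, i.e. four superblocks) being carved up, handed out, and merged back. A toy model of that acquire/release cycle, with the caveat that RMM's global arena also splits and merges variable-sized superblocks rather than keeping a fixed count:

```cpp
#include <cstddef>
#include <optional>
#include <vector>

// Carve one upstream allocation into fixed-size superblocks (illustrative).
class toy_global_arena {
 public:
  toy_global_arena(char* upstream_base, std::size_t count, std::size_t superblock_size)
  {
    for (std::size_t i = 0; i < count; ++i) {
      free_.push_back(upstream_base + i * superblock_size);
    }
  }

  // Hand out the lowest-addressed free superblock, if any remain.
  std::optional<char*> acquire()
  {
    if (free_.empty()) { return std::nullopt; }
    char* sb = free_.front();
    free_.erase(free_.begin());
    return sb;
  }

  // Take a superblock back once a per-thread arena no longer needs it.
  void release(char* sb) { free_.push_back(sb); }

 private:
  std::vector<char*> free_;
};
```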
+
+/**
+ * Test arena.
+ */
+
+TEST_F(ArenaTest, ArenaAllocate)  // NOLINT
+{
+  EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3);
+  EXPECT_EQ(per_thread->allocate(256), fake_address4);
+}
+
+TEST_F(ArenaTest, ArenaDeallocate)  // NOLINT
+{
+  auto* ptr = per_thread->allocate(superblock::minimum_size);
+  per_thread->deallocate(ptr, superblock::minimum_size, {});
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergePrevious)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->allocate(256);
+  per_thread->deallocate(ptr, 256, {});
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(512), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergeNext)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->allocate(256);
+  per_thread->deallocate(ptr2, 256, {});
+  per_thread->deallocate(ptr, 256, {});
+  EXPECT_EQ(per_thread->allocate(512), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergePreviousAndNext)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->deallocate(ptr, 256, {});
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(2_KiB), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDefragment)  // NOLINT
+{
+  std::vector<void*> pointers;
+  std::size_t num_pointers{4};
+  for (std::size_t i = 0; i < num_pointers; i++) {
+    pointers.push_back(per_thread->allocate(superblock::minimum_size));
+  }
+  for (auto* ptr : pointers) {
+    per_thread->deallocate(ptr, superblock::minimum_size, {});
+  }
+  EXPECT_EQ(global->allocate(arena_size), nullptr);
+  per_thread->defragment();
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
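`ArenaDefragment` captures the interesting failure mode: even after the per-thread arena frees everything, the global arena cannot serve an `arena_size` request because the empty superblocks are still cached locally; `defragment()` hands them back. In sketch form, with illustrative types only:

```cpp
#include <vector>

// A per-thread cache returns its empty superblocks to the shared free list so
// a full-size request can be served again (not RMM's actual API).
struct toy_thread_arena {
  std::vector<char*> cached_superblocks;  // empty superblocks held locally

  void defragment(std::vector<char*>& global_free_list)
  {
    for (char* sb : cached_superblocks) { global_free_list.push_back(sb); }
    cached_superblocks.clear();
  }
};
```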
+
+/**
+ * Test arena_memory_resource.
+ */
+
+TEST_F(ArenaTest, ThrowOnNullUpstream)  // NOLINT
 {
   auto construct_nullptr = []() { arena_mr mr{nullptr}; };
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
   EXPECT_THROW(construct_nullptr(), rmm::logic_error);
 }
 
-TEST(ArenaTest, ThrowMaxLessThanInitial)
+TEST_F(ArenaTest, SizeSmallerThanSuperblockSize)  // NOLINT
 {
-  // Make sure first argument is enough larger than the second that alignment rounding doesn't
-  // make them equal
-  auto max_less_than_initial = []() {
-    const auto initial{4_MiB};
-    const auto maximum{2_MiB};
-    cuda_mr cuda;
-    arena_mr mr{&cuda, initial, maximum};
-  };
-  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+  auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; };
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
+  EXPECT_THROW(construct_small(), rmm::logic_error);
+}
+
+TEST_F(ArenaTest, AllocateNinetyPercent)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    auto const free = rmm::detail::available_device_memory().first;
+    auto const ninety_percent =
+      rmm::detail::align_up(static_cast<std::size_t>(static_cast<double>(free) * 0.9),
+                            rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+    arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent);
+  }());
+}
+
+TEST_F(ArenaTest, SmallMediumLarge)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    arena_mr mr(rmm::mr::get_current_device_resource());
+    auto* small     = mr.allocate(256);
+    auto* medium    = mr.allocate(64_MiB);
+    auto const free = rmm::detail::available_device_memory().first;
+    auto* large     = mr.allocate(free / 3);
+    mr.deallocate(small, 256);
+    mr.deallocate(medium, 64_MiB);
+    mr.deallocate(large, free / 3);
+  }());
+}
+
+TEST_F(ArenaTest, Defragment)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    auto const arena_size = superblock::minimum_size * 4;
+    arena_mr mr(rmm::mr::get_current_device_resource(), arena_size);
+    std::vector<std::thread> threads;
+    std::size_t num_threads{4};
+    threads.reserve(num_threads);
+    for (std::size_t i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::thread([&] {
+        cuda_stream stream{};
+        void* ptr = mr.allocate(32_KiB, stream);
+        mr.deallocate(ptr, 32_KiB, stream);
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+
+    auto* ptr = mr.allocate(arena_size);
+    mr.deallocate(ptr, arena_size);
+  }());
 }
 
-TEST(ArenaTest, DumpLogOnFailure)
+TEST_F(ArenaTest, DumpLogOnFailure)  // NOLINT
 {
-  cuda_mr cuda;
-  arena_mr mr{&cuda, 1_MiB, 4_MiB, true};
+  arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB, true};
 
   {  // make the log interesting
     std::vector<std::thread> threads;
@@ -77,6 +553,7 @@ TEST(ArenaTest, DumpLogOnFailure)
     }
   }
 
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
   EXPECT_THROW(mr.allocate(8_MiB), rmm::out_of_memory);
 
   struct stat file_status {
@@ -85,10 +562,9 @@
   EXPECT_GE(file_status.st_size, 0);
 }
 
-TEST(ArenaTest, FeatureSupport)
+TEST_F(ArenaTest, FeatureSupport)  // NOLINT
 {
-  cuda_mr cuda;
-  arena_mr mr{&cuda, 1_MiB, 4_MiB};
+  arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB};
   EXPECT_TRUE(mr.supports_streams());
   EXPECT_FALSE(mr.supports_get_mem_info());
   auto [free, total] = mr.get_mem_info(rmm::cuda_stream_default);
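For downstream users, the visible change is the constructor: one optional `arena_size` instead of the old initial/maximum pair. A minimal usage sketch against the signature introduced in this PR, assuming a plain CUDA upstream and omitting error handling:

```cpp
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Omitting arena_size uses half of the free device memory.
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  // Route subsequent device allocations through the arena.
  rmm::mr::set_current_device_resource(&mr);

  void* ptr = mr.allocate(1U << 20);  // 1 MiB on the default stream
  mr.deallocate(ptr, 1U << 20);
  return 0;
}
```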