diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 828561dd1..470442830 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -170,7 +170,9 @@ inline auto make_pool() inline auto make_arena() { - return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda()); + auto free = rmm::detail::available_device_memory().first; + constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead. + return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda(), free - reserve); } inline auto make_binning() diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b6f15f36f..f1b4e40c4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,26 +78,21 @@ class arena_memory_resource final : public device_memory_resource { * @brief Construct an `arena_memory_resource`. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * - * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. */ explicit arena_memory_resource(Upstream* upstream_mr, - std::size_t initial_size = global_arena::default_initial_size, - std::size_t maximum_size = global_arena::default_maximum_size, - bool dump_log_on_failure = false) - : global_arena_{upstream_mr, initial_size, maximum_size}, - dump_log_on_failure_{dump_log_on_failure} + std::optional<std::size_t> arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output.
+ logger_->set_level(spdlog::level::info); } } @@ -125,17 +120,15 @@ class arena_memory_resource final : public device_memory_resource { bool supports_get_mem_info() const noexcept override { return false; } private: - using global_arena = detail::arena::global_arena<Upstream>; - using arena = detail::arena::arena<Upstream>; - using read_lock = std::shared_lock<std::shared_timed_mutex>; - using write_lock = std::lock_guard<std::shared_timed_mutex>; + using global_arena = rmm::mr::detail::arena::global_arena<Upstream>; + using arena = rmm::mr::detail::arena::arena<Upstream>; /** * @brief Allocates memory of size at least `bytes`. * * The returned pointer has at least 256-byte alignment. * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @throws `rmm::out_of_memory` if no more memory is available for the requested size. * * @param bytes The size in bytes of the allocation. * @param stream The stream to associate this allocation with. @@ -144,22 +137,43 @@ class arena_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (bytes <= 0) { return nullptr; } +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif + auto& arena = get_arena(stream); - bytes = detail::arena::align_up(bytes); - auto& arena = get_arena(stream); - void* pointer = arena.allocate(bytes); + { + std::shared_lock lock(mtx_); + void* pointer = arena.allocate(bytes); + if (pointer != nullptr) { return pointer; } + } - if (pointer == nullptr) { - write_lock lock(mtx_); + { + std::unique_lock lock(mtx_); defragment(); - pointer = arena.allocate(bytes); + void* pointer = arena.allocate(bytes); if (pointer == nullptr) { if (dump_log_on_failure_) { dump_memory_log(bytes); } RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } + return pointer; } + } - return pointer; + /** + * @brief Defragment memory by returning all superblocks to the global arena. + */ + void defragment() + { + RMM_CUDA_TRY(cudaDeviceSynchronize()); + for (auto& thread_arena : thread_arenas_) { + thread_arena.second->clean(); + } + for (auto& stream_arena : stream_arenas_) { + stream_arena.second.clean(); + } } /** @@ -167,29 +181,56 @@ class arena_memory_resource final : public device_memory_resource { * * @param ptr Pointer to be deallocated. * @param bytes The size in bytes of the allocation. This must be equal to the - * value of `bytes` that was passed to the `allocate` call that returned `p`. + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. * @param stream Stream on which to perform deallocation. */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (ptr == nullptr || bytes <= 0) { return; } +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif + auto& arena = get_arena(stream); + + { + std::shared_lock lock(mtx_); + // If the memory being freed does not belong to the arena, the following will return false. + if (arena.deallocate(ptr, bytes, stream)) { return; } + } + + { + // Since we are returning this memory to another stream, we need to make sure the current + // stream is caught up.
+ stream.synchronize_no_throw(); - bytes = detail::arena::align_up(bytes); - get_arena(stream).deallocate(ptr, bytes, stream); + std::unique_lock lock(mtx_); + deallocate_from_other_arena(ptr, bytes, stream); + } } /** - * @brief Defragment memory by returning all free blocks to the global arena. + * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena. + * + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. */ - void defragment() + void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream) { - RMM_CUDA_TRY(cudaDeviceSynchronize()); - for (auto& thread_arena : thread_arenas_) { - thread_arena.second->clean(); - } - for (auto& stream_arena : stream_arenas_) { - stream_arena.second.clean(); + if (use_per_thread_arena(stream)) { + for (auto const& thread_arena : thread_arenas_) { + if (thread_arena.second->deallocate(ptr, bytes)) { return; } + } + } else { + for (auto& stream_arena : stream_arenas_) { + if (stream_arena.second.deallocate(ptr, bytes)) { return; } + } } + + if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); } } /** @@ -213,12 +254,12 @@ { auto const thread_id = std::this_thread::get_id(); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = thread_arenas_.find(thread_id); if (iter != thread_arenas_.end()) { return *iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared<arena>(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena}; @@ -235,12 +276,12 @@ { RMM_LOGGING_ASSERT(!use_per_thread_arena(stream)); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = stream_arenas_.find(stream.value()); if (iter != stream_arenas_.end()) { return iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); stream_arenas_.emplace(stream.value(), global_arena_); return stream_arenas_.at(stream.value()); } } @@ -269,18 +310,6 @@ logger_->info("**************************************************"); logger_->info("Global arena:"); global_arena_.dump_memory_log(logger_); - logger_->info("Per-thread arenas:"); - for (auto const& thread_arena : thread_arenas_) { - logger_->info(" Thread {}:", thread_arena.first); - thread_arena.second->dump_memory_log(logger_); - } - if (!stream_arenas_.empty()) { - logger_->info("Per-stream arenas:"); - for (auto const& stream_arena : stream_arenas_) { - logger_->info(" Stream {}:", static_cast<void*>(stream_arena.first)); - stream_arena.second.dump_memory_log(logger_); - } - } logger_->flush(); }
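For reviewers: the two-phase lookup in `get_arena` above (shared lock on the fast path, exclusive lock only to insert) is easier to see in isolation. A minimal C++17 sketch with hypothetical names — not code from this PR:

```cpp
#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>

// Sketch of the get_arena() pattern: many readers may search the map under a
// shared lock; only a first-time caller takes the exclusive lock to insert.
class arena_registry {
 public:
  int& get_or_create(std::thread::id key)
  {
    {
      std::shared_lock lock(mtx_);  // fast path: concurrent readers
      auto const iter = map_.find(key);
      if (iter != map_.end()) { return *iter->second; }
    }
    std::unique_lock lock(mtx_);  // slow path: exclusive writer
    // try_emplace is a no-op if another thread inserted first.
    auto const [iter, inserted] = map_.try_emplace(key, std::make_shared<int>(0));
    return *iter->second;
  }

 private:
  std::map<std::thread::id, std::shared_ptr<int>> map_;
  mutable std::shared_mutex mtx_;
};
```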
@@ -304,11 +333,13 @@ /// Implementation note: for small sizes, map is more efficient than unordered_map. std::map<cudaStream_t, arena> stream_arenas_; /// If true, dump memory information to log on allocation failure. - bool dump_log_on_failure_; + bool dump_log_on_failure_{}; /// The logger for memory dump. std::shared_ptr<spdlog::logger> logger_{}; - /// Mutex for read and write locks. - mutable std::shared_timed_mutex mtx_; + /// Mutex for read and write locks on arena maps. + mutable std::shared_mutex map_mtx_; + /// Mutex for shared and unique locks on the mr. + mutable std::shared_mutex mtx_; }; } // namespace rmm::mr diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index cda5a73ad..c0e5df377 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -32,216 +33,455 @@ #include #include #include +#include #include -#include namespace rmm::mr::detail::arena { -/// Minimum size of a superblock (256 KiB). -constexpr std::size_t minimum_superblock_size = 1U << 18U; - /** - * @brief Represents a chunk of memory that can be allocated and deallocated. + * @brief Align up to nearest size class. * - * A block bigger than a certain size is called a "superblock". + * @param[in] value value to align. + * @return Return the aligned value. */ -class block { - public: - /** - * @brief Construct a default block. - */ - block() = default; +inline std::size_t align_to_size_class(std::size_t value) noexcept +{ + // See http://jemalloc.net/jemalloc.3.html. + // NOLINTBEGIN(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + static std::array size_classes{ + // clang-format off + // Spacing 256: + 256UL, 512UL, 768UL, 1024UL, 1280UL, 1536UL, 1792UL, 2048UL, + // Spacing 512: + 2560UL, 3072UL, 3584UL, 4096UL, + // Spacing 1 KiB: + 5UL << 10, 6UL << 10, 7UL << 10, 8UL << 10, + // Spacing 2 KiB: + 10UL << 10, 12UL << 10, 14UL << 10, 16UL << 10, + // Spacing 4 KiB: + 20UL << 10, 24UL << 10, 28UL << 10, 32UL << 10, + // Spacing 8 KiB: + 40UL << 10, 48UL << 10, 56UL << 10, 64UL << 10, + // Spacing 16 KiB: + 80UL << 10, 96UL << 10, 112UL << 10, 128UL << 10, + // Spacing 32 KiB: + 160UL << 10, 192UL << 10, 224UL << 10, 256UL << 10, + // Spacing 64 KiB: + 320UL << 10, 384UL << 10, 448UL << 10, 512UL << 10, + // Spacing 128 KiB: + 640UL << 10, 768UL << 10, 896UL << 10, 1UL << 20, + // Spacing 256 KiB: + 1280UL << 10, 1536UL << 10, 1792UL << 10, 2UL << 20, + // Spacing 512 KiB: + 2560UL << 10, 3UL << 20, 3584UL << 10, 4UL << 20, + // Spacing 1 MiB: + 5UL << 20, 6UL << 20, 7UL << 20, 8UL << 20, + // Spacing 2 MiB: + 10UL << 20, 12UL << 20, 14UL << 20, 16UL << 20, + // Spacing 4 MiB: + 20UL << 20, 24UL << 20, 28UL << 20, 32UL << 20, + // Spacing 8 MiB: + 40UL << 20, 48UL << 20, 56UL << 20, 64UL << 20, + // Spacing 16 MiB: + 80UL << 20, 96UL << 20, 112UL << 20, 128UL << 20, + // Spacing 32 MiB: + 160UL << 20, 192UL << 20, 224UL << 20, 256UL << 20, + // Spacing 64 MiB: + 320UL << 20, 384UL << 20, 448UL << 20, 512UL << 20, + // Spacing 128 MiB: + 640UL << 20, 768UL << 20, 896UL << 20, 1UL << 30, + // Spacing 256 MiB: + 1280UL << 20, 1536UL << 20, 1792UL << 20, 2UL << 30, + // Spacing 512 MiB: + 2560UL << 20, 3UL << 30, 3584UL << 20, 4UL << 30, + // Spacing 1 GiB: + 5UL << 30, 6UL << 30, 7UL << 30, 8UL << 30, + // Spacing 2 GiB: + 10UL << 30, 12UL << 30, 14UL << 30, 16UL << 30, + // Spacing 4 GiB: + 20UL << 30, 24UL << 30, 28UL << 30, 32UL << 30, + // Spacing 8 GiB: + 40UL << 30, 48UL << 30, 56UL << 30, 64UL << 30, + // Spacing 16 GiB: + 80UL << 30, 96UL << 30, 112UL << 30, 128UL << 30, + // Spacing 32 GiB:
+ 160UL << 30, 192UL << 30, 224UL << 30, 256UL << 30, + // Catch all: + std::numeric_limits<std::size_t>::max() + // clang-format on + }; + // NOLINTEND(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + + auto* bound = std::lower_bound(size_classes.begin(), size_classes.end(), value); + RMM_LOGGING_ASSERT(bound != size_classes.end()); + return *bound; +} +/** + * @brief Represents a contiguous region of memory. + */ +class byte_span { + public: /** - * @brief Construct a block given a pointer and size. - * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @brief Construct a default span. */ - block(char* pointer, std::size_t size) : pointer_(pointer), size_(size) {} + byte_span() = default; /** - * @brief Construct a block given a void pointer and size. + * @brief Construct a span given a pointer and size. * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @param pointer The address for the beginning of the span. + * @param size The size of the span. */ - block(void* pointer, std::size_t size) : pointer_(static_cast<char*>(pointer)), size_(size) {} + byte_span(void* pointer, std::size_t size) : pointer_{static_cast<char*>(pointer)}, size_{size} + { + RMM_LOGGING_ASSERT(pointer != nullptr); + RMM_LOGGING_ASSERT(size > 0); + } /// Returns the underlying pointer. - [[nodiscard]] void* pointer() const { return pointer_; } + [[nodiscard]] char* pointer() const { return pointer_; } - /// Returns the size of the block. + /// Returns the size of the span. [[nodiscard]] std::size_t size() const { return size_; } - /// Returns true if this block is valid (non-null), false otherwise. - [[nodiscard]] bool is_valid() const { return pointer_ != nullptr; } + /// Returns the end of the span. + [[nodiscard]] char* end() const + { + return pointer_ + size_; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + } + + /// Returns true if this span is valid (non-null), false otherwise. + [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } + + /// Used by std::set to compare spans. + bool operator<(byte_span const& span) const + { + RMM_LOGGING_ASSERT(span.is_valid()); + return pointer_ < span.pointer_; + } - /// Returns true if this block is a superblock, false otherwise. - [[nodiscard]] bool is_superblock() const { return size_ >= minimum_superblock_size; } + private: + char* pointer_{}; ///< Raw memory pointer. + std::size_t size_{}; ///< Size in bytes. +}; + +/// Calculate the total size of a set of spans. +template <typename T> +inline auto total_memory_size(std::set<T> const& spans) +{ + return std::accumulate( + spans.cbegin(), spans.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.size(); + }); +} + +/** + * @brief Represents a chunk of memory that can be allocated and deallocated. + */ +class block final : public byte_span { + public: + using byte_span::byte_span; /** - * @brief Verifies whether this block can be merged to the beginning of block b. + * @brief Is this block large enough to fit `bytes` bytes? * - * @param b The block to check for contiguity. - * @return true Returns true if this block's `pointer` + `size` == `b.ptr`, and `not b.is_head`, - false otherwise. + * @param bytes The size in bytes to check for fit. + * @return true if this block is at least `bytes` bytes.
*/ - [[nodiscard]] bool is_contiguous_before(block const& blk) const + [[nodiscard]] bool fits(std::size_t bytes) const { - // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return pointer_ + size_ == blk.pointer_; + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(bytes > 0); + return size() >= bytes; } /** - * @brief Is this block large enough to fit `sz` bytes? + * @brief Verifies whether this block can be merged to the beginning of block blk. * - * @param size The size in bytes to check for fit. - * @return true if this block is at least `sz` bytes. + * @param blk The block to check for contiguity. + * @return true Returns true if this block's `pointer` + `size` == `blk.pointer`. */ - [[nodiscard]] bool fits(std::size_t size) const { return size_ >= size; } + [[nodiscard]] bool is_contiguous_before(block const& blk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return pointer() + size() == blk.pointer(); + } /** * @brief Split this block into two by the given size. * - * @param size The size in bytes of the first block. - * @return std::pair<block, block> A pair of blocks split by sz. + * @param bytes The size in bytes of the first block. + * @return std::pair<block, block> A pair of blocks split by bytes. */ - [[nodiscard]] std::pair<block, block> split(std::size_t size) const + [[nodiscard]] std::pair<block, block> split(std::size_t bytes) const { - RMM_LOGGING_ASSERT(size_ >= size); + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(size() > bytes); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (size_ > size) { return {{pointer_, size}, {pointer_ + size, size_ - size}}; } - return {*this, {}}; + return {{pointer(), bytes}, {pointer() + bytes, size() - bytes}}; } /** * @brief Coalesce two contiguous blocks into one. * - * `this->is_contiguous_before(b)` must be true. + * `this->is_contiguous_before(blk)` must be true. * - * @param b block to merge. + * @param blk block to merge. * @return block The merged block. */ [[nodiscard]] block merge(block const& blk) const { RMM_LOGGING_ASSERT(is_contiguous_before(blk)); - return {pointer_, size_ + blk.size_}; + return {pointer(), size() + blk.size()}; } - - /// Used by std::set to compare blocks. - bool operator<(block const& blk) const { return pointer_ < blk.pointer_; } - - private: - char* pointer_{}; ///< Raw memory pointer. - std::size_t size_{}; ///< Size in bytes. -}; -inline bool block_size_compare(block lhs, block rhs) { return lhs.size() < rhs.size(); } - -/** - * @brief Align up to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value - */ -constexpr std::size_t align_up(std::size_t value) noexcept +/// Comparison function for block sizes. +inline bool block_size_compare(block const& lhs, block const& rhs) { - return rmm::detail::align_up(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + RMM_LOGGING_ASSERT(lhs.is_valid()); + RMM_LOGGING_ASSERT(rhs.is_valid()); + return lhs.size() < rhs.size(); }
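For context, the `block` span arithmetic above round-trips: `split` produces two adjacent blocks and `merge` reassembles them. A minimal sketch against this header — the host-side buffer stands in for an arena allocation, and the function name is illustrative, not from this PR:

```cpp
#include <rmm/mr/device/detail/arena.hpp>
#include <cassert>

void block_split_merge_example()
{
  using rmm::mr::detail::arena::block;
  static char buffer[1024];  // stand-in for arena memory; no device allocation needed
  block const whole{buffer, sizeof(buffer)};
  auto const [head, tail] = whole.split(256);
  assert(head.size() == 256 && tail.size() == 768);
  assert(head.is_contiguous_before(tail));  // head ends exactly where tail begins,
  block const merged = head.merge(tail);    // so the two can be coalesced again
  assert(merged.pointer() == whole.pointer() && merged.size() == whole.size());
}
```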
/** - * @brief Align down to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value + * @brief Represents a large chunk of memory that is exchanged between the global arena and + * per-thread arenas. */ -constexpr std::size_t align_down(std::size_t value) noexcept -{ - return rmm::detail::align_down(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); -} +class superblock final : public byte_span { + public: + /// Minimum size of a superblock (1 MiB). + static constexpr std::size_t minimum_size{1UL << 20}; + /// Maximum size of a superblock (1 TiB), as a sanity check. + static constexpr std::size_t maximum_size{1UL << 40}; -/** - * @brief Get the first free block of at least `size` bytes. - * - * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to - * memory fragmentation, and slightly cheaper to implement. It is also used by some popular - * allocators such as jemalloc. - * - * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM - * Sigplan Notices, 34(3), 26-36. - * - * @param free_blocks The address-ordered set of free blocks. - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes, or an empty block if not found. - */ -inline block first_fit(std::set<block>& free_blocks, std::size_t size) -{ - auto fits = [size](auto const& blk) { return blk.fits(size); }; + /** + * @brief Construct a default superblock. + */ + superblock() = default; - auto const iter = std::find_if(free_blocks.cbegin(), free_blocks.cend(), fits); + /** + * @brief Construct a superblock given a pointer and size. + * + * @param pointer The address for the beginning of the superblock. + * @param size The size of the superblock. + */ + superblock(void* pointer, std::size_t size) : byte_span{pointer, size} + { + RMM_LOGGING_ASSERT(size >= minimum_size); + RMM_LOGGING_ASSERT(size <= maximum_size); + free_blocks_.emplace(pointer, size); + } - if (iter == free_blocks.cend()) { return {}; } - // Remove the block from the free_list. - auto const blk = *iter; - auto const next = free_blocks.erase(iter); + // Disable copy semantics. + superblock(superblock const&) = delete; + superblock& operator=(superblock const&) = delete; + // Allow move semantics. + superblock(superblock&&) noexcept = default; + superblock& operator=(superblock&&) noexcept = default; - if (blk.size() > size) { - // Split the block and put the remainder back. - auto const split = blk.split(size); - free_blocks.insert(next, split.second); - return split.first; + ~superblock() = default; + + /** + * @brief Is this superblock empty? + * + * @return true if this superblock is empty. + */ + [[nodiscard]] bool empty() const + { + RMM_LOGGING_ASSERT(is_valid()); + return free_blocks_.size() == 1 && free_blocks_.cbegin()->size() == size(); } - return blk; -} /** - * @brief Coalesce the given block with other free blocks. - * - * @param free_blocks The address-ordered set of free blocks. - * @param b The block to coalesce. - * @return block The coalesced block. - */ -inline block coalesce_block(std::set<block>& free_blocks, block const& blk) + * @brief Return the number of free blocks. + * + * @return the number of free blocks. + */ + [[nodiscard]] std::size_t free_blocks() const + { + RMM_LOGGING_ASSERT(is_valid()); + return free_blocks_.size(); + } + + /** + * @brief Whether this superblock contains the given block. + * + * @param blk The block to search for. + * @return true if the given block belongs to this superblock. + */ + [[nodiscard]] bool contains(block const& blk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return pointer() <= blk.pointer() && pointer() + size() >= blk.pointer() + blk.size(); + } + + /** + * @brief Can this superblock fit `bytes` bytes? + * + * @param bytes The size in bytes to check for fit. + * @return true if this superblock can fit `bytes` bytes.
+ */ + [[nodiscard]] bool fits(std::size_t bytes) const + { + RMM_LOGGING_ASSERT(is_valid()); + return std::any_of(free_blocks_.cbegin(), free_blocks_.cend(), [bytes](auto const& blk) { + return blk.fits(bytes); + }); + } + + /** + * @brief Verifies whether this superblock can be merged to the beginning of superblock sblk. + * + * @param sblk The superblock to check for contiguity. + * @return true Returns true if both superblocks are empty and this superblock's + * `pointer` + `size` == `sblk.pointer`. + */ + [[nodiscard]] bool is_contiguous_before(superblock const& sblk) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(sblk.is_valid()); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return empty() && sblk.empty() && pointer() + size() == sblk.pointer(); + } + + /** + * @brief Split this superblock into two by the given size. + * + * @param bytes The size in bytes of the first superblock. + * @return std::pair<superblock, superblock> A pair of superblocks split by bytes. + */ + [[nodiscard]] std::pair<superblock, superblock> split(std::size_t bytes) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(empty() && bytes >= minimum_size && size() >= bytes + minimum_size); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return {superblock{pointer(), bytes}, superblock{pointer() + bytes, size() - bytes}}; + } + + /** + * @brief Coalesce two contiguous superblocks into one. + * + * `this->is_contiguous_before(sblk)` must be true. + * + * @param sblk superblock to merge. + * @return superblock The merged superblock. + */ + [[nodiscard]] superblock merge(superblock const& sblk) const + { + RMM_LOGGING_ASSERT(is_contiguous_before(sblk)); + return {pointer(), size() + sblk.size()}; + } + + /** + * @brief Get the first free block of at least `size` bytes. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. + */ + block first_fit(std::size_t size) + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(size > 0); + + auto fits = [size](auto const& blk) { return blk.fits(size); }; + auto const iter = std::find_if(free_blocks_.cbegin(), free_blocks_.cend(), fits); + if (iter == free_blocks_.cend()) { return {}; } + + // Remove the block from the free list. + auto const blk = *iter; + auto const next = free_blocks_.erase(iter); + + if (blk.size() > size) { + // Split the block and put the remainder back. + auto const split = blk.split(size); + free_blocks_.insert(next, split.second); + return split.first; + } + return blk; + } + + /** + * @brief Coalesce the given block with other free blocks. + * + * @param blk The block to coalesce. + */ + void coalesce(block const& blk) // NOLINT(readability-function-cognitive-complexity) + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); + RMM_LOGGING_ASSERT(contains(blk)); + + // Find the right place (in ascending address order) to insert the block. + auto const next = free_blocks_.lower_bound(blk); + auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks.
+ bool const merge_prev = previous != free_blocks_.cend() && previous->is_contiguous_before(blk); + bool const merge_next = next != free_blocks_.cend() && blk.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto const merged = previous->merge(blk).merge(*next); + free_blocks_.erase(previous); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else if (merge_prev) { + auto const merged = previous->merge(blk); + auto const iter = free_blocks_.erase(previous); + free_blocks_.insert(iter, merged); + } else if (merge_next) { + auto const merged = blk.merge(*next); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else { + free_blocks_.insert(next, blk); + } + } + + /** + * @brief Find the total free block size. + * @return the total free block size. + */ + [[nodiscard]] std::size_t total_free_size() const { return total_memory_size(free_blocks_); } + + /** + * @brief Find the max free block size. + * @return the max free block size. + */ + [[nodiscard]] std::size_t max_free_size() const + { + if (free_blocks_.empty()) { return 0; } + return std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare)->size(); + } + + private: + /// Address-ordered set of free blocks. + std::set<block> free_blocks_{}; +}; + +/// Calculate the total free size of a set of superblocks. +inline auto total_free_size(std::set<superblock> const& superblocks) { - if (!blk.is_valid()) { return blk; } - - // Find the right place (in ascending address order) to insert the block. - auto const next = free_blocks.lower_bound(blk); - auto const previous = next == free_blocks.cbegin() ? next : std::prev(next); - - // Coalesce with neighboring blocks. - bool const merge_prev = previous->is_contiguous_before(blk); - bool const merge_next = next != free_blocks.cend() && blk.is_contiguous_before(*next); - - block merged{}; - if (merge_prev && merge_next) { - merged = previous->merge(blk).merge(*next); - free_blocks.erase(previous); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else if (merge_prev) { - merged = previous->merge(blk); - auto const iter = free_blocks.erase(previous); - free_blocks.insert(iter, merged); - } else if (merge_next) { - merged = blk.merge(*next); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else { - free_blocks.emplace(blk); - merged = blk; - } - return merged; + return std::accumulate( + superblocks.cbegin(), superblocks.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.total_free_size(); + }); } -template <typename T> -inline auto total_block_size(T const& blocks) +/// Find the max free size from a set of superblocks. +inline auto max_free_size(std::set<superblock> const& superblocks) { - return std::accumulate(blocks.cbegin(), blocks.cend(), std::size_t{}, [](auto lhs, auto rhs) { - return lhs + rhs.size(); - }); -} + std::size_t size{}; + for (auto const& sblk : superblocks) { + size = std::max(size, sblk.max_free_size()); + } + return size; +};
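The two helpers above feed the fragmentation figure that `dump_memory_log` reports further down. Factored out as a standalone formula — a sketch; the zero-free guard is an assumption added here, not code from this PR:

```cpp
#include <cstddef>

// Fragmentation as reported by dump_memory_log(): 0% when a single free block
// holds all free memory, approaching 100% as free memory scatters across blocks.
double fragmentation_percent(std::size_t total_free, std::size_t max_free)
{
  if (total_free == 0) { return 0.0; }
  return (1.0 - static_cast<double>(max_free) / static_cast<double>(total_free)) * 100.0;
}
```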
/** * @brief The global arena for allocating memory from the upstream memory resource. @@ -254,51 +494,24 @@ inline auto total_block_size(T const& blocks) template <typename Upstream> class global_arena final { public: - /// The default initial size for the global arena. - static constexpr std::size_t default_initial_size = std::numeric_limits<std::size_t>::max(); - /// The default maximum size for the global arena. - static constexpr std::size_t default_maximum_size = std::numeric_limits<std::size_t>::max(); - /// Reserved memory that should not be allocated (64 MiB). - static constexpr std::size_t reserved_size = 1U << 26U; - /** * @brief Construct a global arena. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. */ - global_arena(Upstream* upstream_mr, std::size_t initial_size, std::size_t maximum_size) - : upstream_mr_{upstream_mr}, maximum_size_{maximum_size} + global_arena(Upstream* upstream_mr, std::optional<std::size_t> arena_size) + : upstream_mr_{upstream_mr} { RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); - RMM_EXPECTS(initial_size == default_initial_size || initial_size == align_up(initial_size), - "Error, Initial arena size required to be a multiple of 256 bytes"); - RMM_EXPECTS(maximum_size_ == default_maximum_size || maximum_size_ == align_up(maximum_size_), - "Error, Maximum arena size required to be a multiple of 256 bytes"); - - if (initial_size == default_initial_size || maximum_size == default_maximum_size) { - std::size_t free{}; - std::size_t total{}; - RMM_CUDA_TRY(cudaMemGetInfo(&free, &total)); - if (initial_size == default_initial_size) { - initial_size = align_up(std::min(free, total / 2)); - } - if (maximum_size_ == default_maximum_size) { - maximum_size_ = align_down(free) - reserved_size; - } - } - RMM_EXPECTS(initial_size <= maximum_size_, "Initial arena size exceeds the maximum pool size!"); - - free_blocks_.emplace(expand_arena(initial_size)); + auto const size = rmm::detail::align_down(arena_size.value_or(default_size()), + rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + RMM_EXPECTS(size >= superblock::minimum_size, + "Arena size smaller than minimum superblock size."); + initialize(size); } // Disable copy (and move) semantics. @@ -313,48 +526,120 @@ */ ~global_arena() { - lock_guard lock(mtx_); - for (auto const& blk : upstream_blocks_) { - upstream_mr_->deallocate(blk.pointer(), blk.size()); - } + std::lock_guard lock(mtx_); + upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + } + + /** + * @brief Should allocation of `size` bytes be handled by the global arena directly? + * + * @param size The size in bytes of the allocation. + * @return bool True if the allocation should be handled by the global arena. + */ + bool handles(std::size_t size) const { return size > superblock::minimum_size; } + + /** + * @brief Acquire a superblock that can fit a block of the given size. + * + * @param size The size in bytes of the allocation. + * @return superblock The acquired superblock. + */ + superblock acquire(std::size_t size) + { + // Superblocks should only be acquired if the size is not directly handled by the global arena.
+ RMM_LOGGING_ASSERT(!handles(size)); + std::lock_guard lock(mtx_); + return first_fit(size); + } + + /** + * @brief Release a superblock. + * + * @param sblk Superblock to be released. + */ + void release(superblock&& sblk) + { + RMM_LOGGING_ASSERT(sblk.is_valid()); + std::lock_guard lock(mtx_); + coalesce(std::move(sblk)); } /** - * @brief Allocates memory of size at least `bytes`. + * @brief Release a set of superblocks from a dying arena. * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @param superblocks The set of superblocks. + */ + void release(std::set<superblock>& superblocks) + { + std::lock_guard lock(mtx_); + while (!superblocks.empty()) { + auto sblk = std::move(superblocks.extract(superblocks.cbegin()).value()); + RMM_LOGGING_ASSERT(sblk.is_valid()); + coalesce(std::move(sblk)); + } + } + + /** + * @brief Allocate a large block directly. * - * @param bytes The size in bytes of the allocation. + * @param size The size in bytes of the allocation. * @return void* Pointer to the newly allocated memory. */ - block allocate(std::size_t bytes) + void* allocate(std::size_t size) { - lock_guard lock(mtx_); - return get_block(bytes); + RMM_LOGGING_ASSERT(handles(size)); + std::lock_guard lock(mtx_); + auto sblk = first_fit(size); + if (sblk.is_valid()) { + auto blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk.pointer(); + } + return nullptr; } /** - * @brief Deallocate memory pointed to by `blk`. + * @brief Deallocate memory pointed to by `ptr`. * - * @param blk Block to be deallocated. + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. */ - void deallocate(block const& blk) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - lock_guard lock(mtx_); - coalesce_block(free_blocks_, blk); + RMM_LOGGING_ASSERT(handles(size)); + stream.synchronize_no_throw(); + return deallocate(ptr, size); } /** - * @brief Deallocate a set of free blocks from a dying arena. + * @brief Deallocate memory pointed to by `ptr`. * - * @param free_blocks The set of free blocks. + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @return bool true if the allocation is found, false otherwise.
*/ - void deallocate(std::set<block> const& free_blocks) + bool deallocate(void* ptr, std::size_t bytes) { - lock_guard lock(mtx_); - for (auto const& blk : free_blocks) { - coalesce_block(free_blocks_, blk); + std::lock_guard lock(mtx_); + + block const blk{ptr, bytes}; + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); + if (iter == superblocks_.cend()) { return false; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + if (sblk.empty()) { + coalesce(std::move(sblk)); + } else { + superblocks_.insert(std::move(sblk)); } + return true; } /** @@ -364,86 +649,136 @@ */ void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const { - lock_guard lock(mtx_); - - logger->info(" Maximum size: {}", rmm::detail::bytes{maximum_size_}); - logger->info(" Current size: {}", rmm::detail::bytes{current_size_}); - - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); + std::lock_guard lock(mtx_); + + logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); + logger->info(" # superblocks: {}", superblocks_.size()); + if (!superblocks_.empty()) { + logger->debug(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + auto const total_free = total_free_size(superblocks_); + auto const max_free = max_free_size(superblocks_); + auto const fragmentation = (1 - max_free / static_cast<double>(total_free)) * 100; + logger->info(" Total free memory: {}", rmm::detail::bytes{total_free}); + logger->info(" Largest block of free memory: {}", rmm::detail::bytes{max_free}); + logger->info(" Fragmentation: {:.2f}%", fragmentation); + + auto index = 0; + char* prev_end{}; + for (auto const& sblk : superblocks_) { + if (prev_end == nullptr) { prev_end = sblk.pointer(); } + logger->debug( + " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " + "gap={}", + index, + fmt::ptr(sblk.pointer()), + fmt::ptr(sblk.end()), + rmm::detail::bytes{sblk.size()}, + sblk.empty(), + sblk.free_blocks(), + rmm::detail::bytes{sblk.max_free_size()}, + rmm::detail::bytes{static_cast<std::size_t>(sblk.pointer() - prev_end)}); + prev_end = sblk.end(); + index++; + } } - - logger->info(" # upstream blocks={}", upstream_blocks_.size()); - logger->info(" Total size of upstream blocks: {}", - rmm::detail::bytes{total_block_size(upstream_blocks_)}); } private: - using lock_guard = std::lock_guard<std::mutex>; + /** + * @brief Default size of the global arena if unspecified. + * @return the default global arena size. + */ + constexpr std::size_t default_size() const + { + auto const [free, total] = rmm::detail::available_device_memory(); + return free / 2; + } /** - * @brief Get an available memory block of at least `size` bytes. + * @brief Allocate space from upstream to initialize the arena. * - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes. + * @param size The size to allocate. */ - block get_block(std::size_t size) + void initialize(std::size_t size) { - // Find the first-fit free block.
- auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } - - // No existing larger blocks available, so grow the arena. - auto const upstream_block = expand_arena(size_to_grow(size)); - coalesce_block(free_blocks_, upstream_block); - return first_fit(free_blocks_, size); + upstream_block_ = {upstream_mr_->allocate(size), size}; + superblocks_.emplace(upstream_block_.pointer(), size); } /** - * @brief Get the size to grow the global arena given the requested `size` bytes. + * @brief Get the first superblock that can fit a block of at least `size` bytes. + * + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc. * - * This simply grows the global arena to the maximum size. + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. * - * @param size The number of bytes required. - * @return size The size for the arena to grow, or 0 if no more memory. + * @param size The number of bytes to allocate. + * @return superblock A superblock that can fit at least `size` bytes, or empty if not found. */ - constexpr std::size_t size_to_grow(std::size_t size) const + superblock first_fit(std::size_t size) { - if (current_size_ + size > maximum_size_) { return 0; } - return maximum_size_ - current_size_; + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [=](auto const& sblk) { return sblk.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const min_size = std::max(superblock::minimum_size, size); + if (sblk.empty() && sblk.size() >= min_size + superblock::minimum_size) { + // Split the superblock and put the remainder back. + auto [head, tail] = sblk.split(min_size); + superblocks_.insert(std::move(tail)); + return std::move(head); + } + return sblk; } /** - * @brief Allocate space from upstream to supply the arena and return a sufficiently sized block. + * @brief Coalesce the given superblock with other empty superblocks. * - * @param size The minimum size to allocate. - * @return block A block of at least `size` bytes. + * @param sblk The superblock to coalesce. */ - block expand_arena(std::size_t size) + void coalesce(superblock&& sblk) { - if (size > 0) { - upstream_blocks_.push_back({upstream_mr_->allocate(size), size}); - current_size_ += size; - return upstream_blocks_.back(); + RMM_LOGGING_ASSERT(sblk.is_valid()); + + // Find the right place (in ascending address order) to insert the block. + auto const next = superblocks_.lower_bound(sblk); + auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks.
+ bool const merge_prev = previous != superblocks_.cend() && previous->is_contiguous_before(sblk); + bool const merge_next = next != superblocks_.cend() && sblk.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = prev_sb.merge(sblk).merge(next_sb); + superblocks_.insert(std::move(merged)); + } else if (merge_prev) { + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto merged = prev_sb.merge(sblk); + superblocks_.insert(std::move(merged)); + } else if (merge_next) { + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = sblk.merge(next_sb); + superblocks_.insert(std::move(merged)); + } else { + superblocks_.insert(std::move(sblk)); } - return {}; } /// The upstream resource to allocate memory from. Upstream* upstream_mr_; - /// The maximum size the global arena can grow to. - std::size_t maximum_size_; - /// The current size of the global arena. - std::size_t current_size_{}; - /// Address-ordered set of free blocks. - std::set<block> free_blocks_; - /// Blocks allocated from upstream so that they can be quickly freed. - std::vector<block> upstream_blocks_; + /// Block allocated from upstream so that it can be quickly freed. + block upstream_block_; + /// Address-ordered set of superblocks. + std::set<superblock> superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -452,7 +787,7 @@ * @brief An arena for allocating memory for a thread. * * An arena is a per-thread or per-non-default-stream memory pool. It allocates - * superblocks from the global arena, and return them when the superblocks become empty. + * superblocks from the global arena, and returns them when the superblocks become empty. * * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. @@ -467,78 +802,81 @@ class arena { */ explicit arena(global_arena<Upstream>& global_arena) : global_arena_{global_arena} {} - ~arena() = default; - // Disable copy (and move) semantics. arena(arena const&) = delete; arena& operator=(arena const&) = delete; arena(arena&&) noexcept = delete; arena& operator=(arena&&) noexcept = delete; + ~arena() = default; + /** - * @brief Allocates memory of size at least `bytes`. - * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @brief Allocates memory of size at least `size` bytes. * - * @param bytes The size in bytes of the allocation. + * @param size The size in bytes of the allocation. * @return void* Pointer to the newly allocated memory. */ - void* allocate(std::size_t bytes) + void* allocate(std::size_t size) { - lock_guard lock(mtx_); - auto const blk = get_block(bytes); - return blk.pointer(); + if (global_arena_.handles(size)) { return global_arena_.allocate(size); } + std::lock_guard lock(mtx_); + return get_block(size).pointer(); } /** * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. * * @param ptr Pointer to be deallocated. - * @param bytes The size in bytes of the allocation. This must be equal to the value of `bytes` + * @param size The size in bytes of the allocation. This must be equal to the value of `size` * that was passed to the `allocate` call that returned `ptr`. * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise.
*/ - void deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - lock_guard lock(mtx_); - block const blk{ptr, bytes}; - auto const merged = coalesce_block(free_blocks_, blk); - shrink_arena(merged, stream); + if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + return deallocate(ptr, size); } /** - * @brief Clean the arena and deallocate free blocks from the global arena. + * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. + * + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `ptr`. + * @return bool true if the allocation is found, false otherwise. + */ + bool deallocate(void* ptr, std::size_t size) + { + std::lock_guard lock(mtx_); + return deallocate_from_superblock({ptr, size}); + } + + /** + * @brief Clean the arena and release all superblocks to the global arena. */ void clean() { - lock_guard lock(mtx_); - global_arena_.deallocate(free_blocks_); - free_blocks_.clear(); + std::lock_guard lock(mtx_); + global_arena_.release(superblocks_); + superblocks_.clear(); } /** - * Dump memory to log. - * - * @param logger the spdlog logger to use + * @brief Defragment the arena and release empty superblocks to the global arena. */ - void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const + void defragment() { - lock_guard lock(mtx_); - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); + std::lock_guard lock(mtx_); + while (true) { + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [](auto const& sblk) { return sblk.empty(); }); + if (iter == superblocks_.cend()) { return; } + global_arena_.release(std::move(superblocks_.extract(iter).value())); } } private: - using lock_guard = std::lock_guard<std::mutex>; - /// Maximum number of free blocks to keep. - static constexpr int max_free_blocks = 16; - /** * @brief Get an available memory block of at least `size` bytes. * * @param size The number of bytes to allocate. * @return block A block of memory of at least `size` bytes. */ block get_block(std::size_t size) { - if (size < minimum_superblock_size) { - // Find the first-fit free block. - auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } - } + // Find the first-fit free block. + auto const blk = first_fit(size); + if (blk.is_valid()) { return blk; } // No existing larger blocks available, so grow the arena and obtain a superblock. - auto const superblock = expand_arena(size); - if (superblock.is_valid()) { - coalesce_block(free_blocks_, superblock); - return first_fit(free_blocks_, size); - } - return superblock; + return expand_arena(size); } /** - * @brief Allocate space from upstream to supply the arena and return a superblock. + * @brief Get the first free block of at least `size` bytes. * - * @return A superblock. + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc.
* + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. */ - block expand_arena(std::size_t size) + block first_fit(std::size_t size) + { + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [size](auto const& sblk) { return sblk.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; + } + + /** + * @brief Deallocate a block from the superblock it belongs to. + * + * @param blk The block to deallocate. + * @return true if the block is found. + */ + bool deallocate_from_superblock(block const& blk) { - auto const superblock_size = std::max(size, minimum_superblock_size); - return global_arena_.allocate(superblock_size); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); + if (iter == superblocks_.cend()) { return false; } + + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + superblocks_.insert(std::move(sblk)); + return true; } /** - * @brief Shrink this arena by returning free superblocks to upstream. + * @brief Allocate space from upstream to supply the arena and return a block. * - * @param blk The block that can be used to shrink the arena. - * @param stream Stream on which to perform shrinking. + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes. */ - void shrink_arena(block const& blk, cuda_stream_view stream) + block expand_arena(std::size_t size) { - if (blk.is_superblock() || free_blocks_.size() > max_free_blocks) { - stream.synchronize_no_throw(); - global_arena_.deallocate(blk); - free_blocks_.erase(blk); + auto sblk = global_arena_.acquire(size); + if (sblk.is_valid()) { + RMM_LOGGING_ASSERT(sblk.size() >= superblock::minimum_size); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; } + return {}; } /// The global arena to allocate superblocks from. global_arena<Upstream>& global_arena_; - /// Free blocks. - std::set<block> free_blocks_; + /// Acquired superblocks. + std::set<superblock> superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; };
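Before the test changes, a minimal usage sketch of the revised public API — the old `initial_size`/`maximum_size` pair collapses into one optional arena size (illustrative only, not part of this PR):

```cpp
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr;
  // Default: the global arena claims half of the currently available device memory.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr_default{&cuda_mr};
  // Explicit 1 GiB arena, dumping the memory log on allocation failure.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr_sized{&cuda_mr, 1UL << 30, true};
  void* ptr = mr_sized.allocate(1UL << 20);
  mr_sized.deallocate(ptr, 1UL << 20);
  return 0;
}
```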
diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index d9499802d..b86e2457c 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,51 +15,527 @@ */ #include "../../byte_literals.hpp" -#include "rmm/cuda_stream_view.hpp" - -#include - +#include +#include +#include #include #include -#include #include +#include #include +#include #include #include #include namespace rmm::test { - namespace { -using cuda_mr = rmm::mr::cuda_memory_resource; -using arena_mr = rmm::mr::arena_memory_resource<cuda_mr>; -TEST(ArenaTest, ThrowOnNullUpstream) +class mock_memory_resource { + public: + MOCK_METHOD(void*, allocate, (std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t)); +}; + +using rmm::mr::detail::arena::block; +using rmm::mr::detail::arena::byte_span; +using rmm::mr::detail::arena::superblock; +using global_arena = rmm::mr::detail::arena::global_arena<mock_memory_resource>; +using arena = rmm::mr::detail::arena::arena<mock_memory_resource>; +using arena_mr = rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource>; +using ::testing::Return; + +// NOLINTBEGIN(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +auto const fake_address = reinterpret_cast<void*>(1_KiB); +auto const fake_address2 = reinterpret_cast<void*>(2_KiB); +auto const fake_address3 = reinterpret_cast<void*>(superblock::minimum_size); +auto const fake_address4 = reinterpret_cast<void*>(superblock::minimum_size * 2); +// NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) + +struct ArenaTest : public ::testing::Test { + void SetUp() override + { + EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); + global = std::make_unique<global_arena>(&mock_mr, arena_size); + per_thread = std::make_unique<arena>(*global); + } + + std::size_t arena_size{superblock::minimum_size * 4}; + mock_memory_resource mock_mr{}; + std::unique_ptr<global_arena> global{}; + std::unique_ptr<arena> per_thread{}; +}; + +/** + * Test align_to_size_class. + */ +TEST_F(ArenaTest, AlignToSizeClass) // NOLINT +{ + using rmm::mr::detail::arena::align_to_size_class; + EXPECT_EQ(align_to_size_class(8), 256); + EXPECT_EQ(align_to_size_class(256), 256); + EXPECT_EQ(align_to_size_class(264), 512); + EXPECT_EQ(align_to_size_class(512), 512); + EXPECT_EQ(align_to_size_class(17_KiB), 20_KiB); + EXPECT_EQ(align_to_size_class(13_MiB), 14_MiB); + EXPECT_EQ(align_to_size_class(2500_MiB), 2560_MiB); + EXPECT_EQ(align_to_size_class(128_GiB), 128_GiB); + EXPECT_EQ(align_to_size_class(1_PiB), std::numeric_limits<std::size_t>::max()); +} + +/** + * Test byte_span. + */ + +TEST_F(ArenaTest, ByteSpan) // NOLINT +{ + byte_span const span{}; + EXPECT_FALSE(span.is_valid()); + byte_span const span2{fake_address, 256}; + EXPECT_TRUE(span2.is_valid()); +} + +/** + * Test block.
+ */ + +TEST_F(ArenaTest, BlockFits) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + EXPECT_TRUE(blk.fits(1_KiB)); + EXPECT_FALSE(blk.fits(1_KiB + 1)); +} + +TEST_F(ArenaTest, BlockIsContiguousBefore) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 256}; + EXPECT_TRUE(blk.is_contiguous_before(blk2)); + block const blk3{fake_address, 512}; + block const blk4{fake_address2, 1_KiB}; + EXPECT_FALSE(blk3.is_contiguous_before(blk4)); +} + +TEST_F(ArenaTest, BlockSplit) // NOLINT +{ + block const blk{fake_address, 2_KiB}; + auto const [head, tail] = blk.split(1_KiB); + EXPECT_EQ(head.pointer(), fake_address); + EXPECT_EQ(head.size(), 1_KiB); + EXPECT_EQ(tail.pointer(), fake_address2); + EXPECT_EQ(tail.size(), 1_KiB); +} + +TEST_F(ArenaTest, BlockMerge) // NOLINT +{ + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 1_KiB}; + auto const merged = blk.merge(blk2); + EXPECT_EQ(merged.pointer(), fake_address); + EXPECT_EQ(merged.size(), 2_KiB); +} + +/** + * Test superblock. + */ + +TEST_F(ArenaTest, SuperblockEmpty) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.empty()); + sblk.first_fit(256); + EXPECT_FALSE(sblk.empty()); +} + +TEST_F(ArenaTest, SuperblockContains) // NOLINT +{ + superblock const sblk{fake_address3, superblock::minimum_size}; + block const blk{fake_address, 2_KiB}; + EXPECT_FALSE(sblk.contains(blk)); + block const blk2{fake_address3, 1_KiB}; + EXPECT_TRUE(sblk.contains(blk2)); + block const blk3{fake_address3, superblock::minimum_size + 1}; + EXPECT_FALSE(sblk.contains(blk3)); + block const blk4{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.contains(blk4)); + block const blk5{fake_address4, 256}; + EXPECT_FALSE(sblk.contains(blk5)); +} + +TEST_F(ArenaTest, SuperblockFits) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.fits(superblock::minimum_size)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size + 1)); + + auto const blk = sblk.first_fit(superblock::minimum_size / 4); + sblk.first_fit(superblock::minimum_size / 4); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.fits(superblock::minimum_size / 2)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size / 2 + 1)); +} + +TEST_F(ArenaTest, SuperblockIsContiguousBefore) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); + + auto const blk = sblk.first_fit(256); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); + + auto const blk2 = sb2.first_fit(1_KiB); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sb2.coalesce(blk2); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); +} + +TEST_F(ArenaTest, SuperblockSplit) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size * 2}; + auto const [head, tail] = sblk.split(superblock::minimum_size); + EXPECT_EQ(head.pointer(), fake_address3); + EXPECT_EQ(head.size(), superblock::minimum_size); + EXPECT_TRUE(head.empty()); + EXPECT_EQ(tail.pointer(), fake_address4); + EXPECT_EQ(tail.size(), superblock::minimum_size); + EXPECT_TRUE(tail.empty()); +} + +TEST_F(ArenaTest, SuperblockMerge) // NOLINT +{ + superblock sblk{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; + auto const merged = sblk.merge(sb2); + EXPECT_EQ(merged.pointer(), fake_address3); + EXPECT_EQ(merged.size(),
+
+/**
+ * Test superblock.
+ */
+
+TEST_F(ArenaTest, SuperblockEmpty)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.empty());
+  sblk.first_fit(256);
+  EXPECT_FALSE(sblk.empty());
+}
+
+TEST_F(ArenaTest, SuperblockContains)  // NOLINT
+{
+  superblock const sblk{fake_address3, superblock::minimum_size};
+  block const blk{fake_address, 2_KiB};
+  EXPECT_FALSE(sblk.contains(blk));
+  block const blk2{fake_address3, 1_KiB};
+  EXPECT_TRUE(sblk.contains(blk2));
+  block const blk3{fake_address3, superblock::minimum_size + 1};
+  EXPECT_FALSE(sblk.contains(blk3));
+  block const blk4{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.contains(blk4));
+  block const blk5{fake_address4, 256};
+  EXPECT_FALSE(sblk.contains(blk5));
+}
+
+TEST_F(ArenaTest, SuperblockFits)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  EXPECT_TRUE(sblk.fits(superblock::minimum_size));
+  EXPECT_FALSE(sblk.fits(superblock::minimum_size + 1));
+
+  auto const blk = sblk.first_fit(superblock::minimum_size / 4);
+  sblk.first_fit(superblock::minimum_size / 4);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.fits(superblock::minimum_size / 2));
+  EXPECT_FALSE(sblk.fits(superblock::minimum_size / 2 + 1));
+}
+
+TEST_F(ArenaTest, SuperblockIsContiguousBefore)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  superblock sb2{fake_address4, superblock::minimum_size};
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+
+  auto const blk = sblk.first_fit(256);
+  EXPECT_FALSE(sblk.is_contiguous_before(sb2));
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+
+  auto const blk2 = sb2.first_fit(1_KiB);
+  EXPECT_FALSE(sblk.is_contiguous_before(sb2));
+  sb2.coalesce(blk2);
+  EXPECT_TRUE(sblk.is_contiguous_before(sb2));
+}
+
+TEST_F(ArenaTest, SuperblockSplit)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size * 2};
+  auto const [head, tail] = sblk.split(superblock::minimum_size);
+  EXPECT_EQ(head.pointer(), fake_address3);
+  EXPECT_EQ(head.size(), superblock::minimum_size);
+  EXPECT_TRUE(head.empty());
+  EXPECT_EQ(tail.pointer(), fake_address4);
+  EXPECT_EQ(tail.size(), superblock::minimum_size);
+  EXPECT_TRUE(tail.empty());
+}
+
+TEST_F(ArenaTest, SuperblockMerge)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  superblock sb2{fake_address4, superblock::minimum_size};
+  auto const merged = sblk.merge(sb2);
+  EXPECT_EQ(merged.pointer(), fake_address3);
+  EXPECT_EQ(merged.size(), superblock::minimum_size * 2);
+  EXPECT_TRUE(merged.empty());
+}
+
+TEST_F(ArenaTest, SuperblockFirstFit)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(1_KiB);
+  EXPECT_EQ(blk.pointer(), fake_address3);
+  EXPECT_EQ(blk.size(), 1_KiB);
+  auto const blk2 = sblk.first_fit(2_KiB);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+  EXPECT_EQ(blk2.pointer(), static_cast<char*>(fake_address3) + 1_KiB);
+  EXPECT_EQ(blk2.size(), 2_KiB);
+  sblk.coalesce(blk);
+  auto const blk3 = sblk.first_fit(512);
+  EXPECT_EQ(blk3.pointer(), fake_address3);
+  EXPECT_EQ(blk3.size(), 512);
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceAfterFull)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 2);
+  sblk.first_fit(superblock::minimum_size / 2);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size / 2).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergeNext)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 2);
+  sblk.coalesce(blk);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergePrevious)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk  = sblk.first_fit(1_KiB);
+  auto const blk2 = sblk.first_fit(1_KiB);
+  sblk.first_fit(1_KiB);
+  sblk.coalesce(blk);
+  sblk.coalesce(blk2);
+  auto const blk3 = sblk.first_fit(2_KiB);
+  EXPECT_EQ(blk3.pointer(), fake_address3);
+}
+
+TEST_F(ArenaTest, SuperblockCoalesceMergePreviousAndNext)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk  = sblk.first_fit(1_KiB);
+  auto const blk2 = sblk.first_fit(1_KiB);
+  sblk.coalesce(blk);
+  sblk.coalesce(blk2);
+  EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid());
+}
+
+TEST_F(ArenaTest, SuperblockMaxFreeSize)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  auto const blk = sblk.first_fit(superblock::minimum_size / 4);
+  sblk.first_fit(superblock::minimum_size / 4);
+  sblk.coalesce(blk);
+  EXPECT_EQ(sblk.max_free_size(), superblock::minimum_size / 2);
+}
+
+TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull)  // NOLINT
+{
+  superblock sblk{fake_address3, superblock::minimum_size};
+  sblk.first_fit(superblock::minimum_size);
+  EXPECT_EQ(sblk.max_free_size(), 0);
+}
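The coalesce tests are the core of the design: when a block is freed, it merges with a free neighbor on either side, so the superblock can regain large contiguous spans instead of fragmenting. A compact sketch of first-fit plus neighbor coalescing over an address-ordered free set; offsets stand in for device pointers, and none of this is RMM's actual code:

```cpp
#include <cstddef>
#include <iterator>
#include <limits>
#include <set>

struct free_range {
  std::size_t offset;
  std::size_t size;
  bool operator<(free_range const& other) const { return offset < other.offset; }
};

using free_set = std::set<free_range>;

// Return the offset of the first range that fits, carving off any remainder.
std::size_t first_fit(free_set& frees, std::size_t bytes)
{
  for (auto it = frees.begin(); it != frees.end(); ++it) {
    if (it->size >= bytes) {
      free_range const hit = *it;
      frees.erase(it);
      if (hit.size > bytes) { frees.insert({hit.offset + bytes, hit.size - bytes}); }
      return hit.offset;
    }
  }
  return std::numeric_limits<std::size_t>::max();
}

// Return a range to the free set, merging with adjacent neighbors.
void coalesce(free_set& frees, free_range range)
{
  auto next = frees.lower_bound(range);
  if (next != frees.begin()) {
    auto prev = std::prev(next);
    if (prev->offset + prev->size == range.offset) {
      range = {prev->offset, prev->size + range.size};
      frees.erase(prev);
    }
  }
  if (next != frees.end() && range.offset + range.size == next->offset) {
    range = {range.offset, range.size + next->size};
    frees.erase(next);
  }
  frees.insert(range);
}
```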
+
+/**
+ * Test global_arena.
+ */
+
+TEST_F(ArenaTest, GlobalArenaNullUpstream)  // NOLINT
+{
+  auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; };
+  EXPECT_THROW(construct_nullptr(), rmm::logic_error);  // NOLINT(cppcoreguidelines-avoid-goto)
+}
+
+TEST_F(ArenaTest, GlobalArenaAcquire)  // NOLINT
+{
+  auto const sblk = global->acquire(256);
+  EXPECT_EQ(sblk.pointer(), fake_address3);
+  EXPECT_EQ(sblk.size(), superblock::minimum_size);
+  EXPECT_TRUE(sblk.empty());
+
+  auto const sb2 = global->acquire(1_KiB);
+  EXPECT_EQ(sb2.pointer(), fake_address4);
+  EXPECT_EQ(sb2.size(), superblock::minimum_size);
+  EXPECT_TRUE(sb2.empty());
+
+  global->acquire(512);
+  global->acquire(512);
+  EXPECT_FALSE(global->acquire(512).is_valid());
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergeNext)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  global->release(std::move(sblk));
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergePrevious)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  auto sb2  = global->acquire(1_KiB);
+  global->acquire(512);
+  global->release(std::move(sblk));
+  global->release(std::move(sb2));
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMergePreviousAndNext)  // NOLINT
+{
+  auto sblk = global->acquire(256);
+  auto sb2  = global->acquire(1_KiB);
+  auto sb3  = global->acquire(512);
+  global->release(std::move(sblk));
+  global->release(std::move(sb3));
+  global->release(std::move(sb2));
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaReleaseMultiple)  // NOLINT
+{
+  std::set<superblock> superblocks{};
+  auto sblk = global->acquire(256);
+  superblocks.insert(std::move(sblk));
+  auto sb2 = global->acquire(1_KiB);
+  superblocks.insert(std::move(sb2));
+  auto sb3 = global->acquire(512);
+  superblocks.insert(std::move(sb3));
+  global->release(superblocks);
+  auto* ptr = global->allocate(arena_size);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaAllocate)  // NOLINT
+{
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaAllocateExtraLarge)  // NOLINT
+{
+  EXPECT_EQ(global->allocate(1_PiB), nullptr);
+  EXPECT_EQ(global->allocate(1_PiB), nullptr);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocate)  // NOLINT
+{
+  auto* ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+  global->deallocate(ptr, superblock::minimum_size * 2, {});
+  ptr = global->allocate(superblock::minimum_size * 2);
+  EXPECT_EQ(ptr, fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp)  // NOLINT
+{
+  auto* ptr  = global->allocate(superblock::minimum_size + 256);
+  auto* ptr2 = global->allocate(superblock::minimum_size + 512);
+  global->deallocate(ptr, superblock::minimum_size + 256, {});
+  global->deallocate(ptr2, superblock::minimum_size + 512, {});
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
+
+TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena)  // NOLINT
+{
+  auto sblk       = global->acquire(512);
+  auto const blk  = sblk.first_fit(512);
+  auto const blk2 = sblk.first_fit(1024);
+  global->release(std::move(sblk));
+  global->deallocate(blk.pointer(), blk.size());
+  global->deallocate(blk2.pointer(), blk2.size());
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
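The global arena tests above follow one upstream allocation (`arena_size`, i.e. four superblocks) being carved up, handed out, and merged back. A toy model of that acquire/release cycle, with the caveat that RMM's global arena also splits and merges variable-sized superblocks rather than keeping a fixed count:

```cpp
#include <cstddef>
#include <optional>
#include <vector>

// Carve one upstream allocation into fixed-size superblocks (illustrative).
class toy_global_arena {
 public:
  toy_global_arena(char* upstream_base, std::size_t count, std::size_t superblock_size)
  {
    for (std::size_t i = 0; i < count; ++i) {
      free_.push_back(upstream_base + i * superblock_size);
    }
  }

  // Hand out the lowest-addressed free superblock, if any remain.
  std::optional<char*> acquire()
  {
    if (free_.empty()) { return std::nullopt; }
    char* sb = free_.front();
    free_.erase(free_.begin());
    return sb;
  }

  // Take a superblock back once a per-thread arena no longer needs it.
  void release(char* sb) { free_.push_back(sb); }

 private:
  std::vector<char*> free_;
};
```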
+
+/**
+ * Test arena.
+ */
+
+TEST_F(ArenaTest, ArenaAllocate)  // NOLINT
+{
+  EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3);
+  EXPECT_EQ(per_thread->allocate(256), fake_address4);
+}
+
+TEST_F(ArenaTest, ArenaDeallocate)  // NOLINT
+{
+  auto* ptr = per_thread->allocate(superblock::minimum_size);
+  per_thread->deallocate(ptr, superblock::minimum_size, {});
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergePrevious)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->allocate(256);
+  per_thread->deallocate(ptr, 256, {});
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(512), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergeNext)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->allocate(256);
+  per_thread->deallocate(ptr2, 256, {});
+  per_thread->deallocate(ptr, 256, {});
+  EXPECT_EQ(per_thread->allocate(512), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDeallocateMergePreviousAndNext)  // NOLINT
+{
+  auto* ptr  = per_thread->allocate(256);
+  auto* ptr2 = per_thread->allocate(256);
+  per_thread->deallocate(ptr, 256, {});
+  per_thread->deallocate(ptr2, 256, {});
+  EXPECT_EQ(per_thread->allocate(2_KiB), fake_address3);
+}
+
+TEST_F(ArenaTest, ArenaDefragment)  // NOLINT
+{
+  std::vector<void*> pointers;
+  std::size_t num_pointers{4};
+  for (std::size_t i = 0; i < num_pointers; i++) {
+    pointers.push_back(per_thread->allocate(superblock::minimum_size));
+  }
+  for (auto* ptr : pointers) {
+    per_thread->deallocate(ptr, superblock::minimum_size, {});
+  }
+  EXPECT_EQ(global->allocate(arena_size), nullptr);
+  per_thread->defragment();
+  EXPECT_EQ(global->allocate(arena_size), fake_address3);
+}
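`ArenaDefragment` captures the interesting failure mode: even after the per-thread arena frees everything, the global arena cannot serve an `arena_size` request because the empty superblocks are still cached locally; `defragment()` hands them back. In sketch form, with illustrative types only:

```cpp
#include <vector>

// A per-thread cache returns its empty superblocks to the shared free list so
// a full-size request can be served again (not RMM's actual API).
struct toy_thread_arena {
  std::vector<char*> cached_superblocks;  // empty superblocks held locally

  void defragment(std::vector<char*>& global_free_list)
  {
    for (char* sb : cached_superblocks) { global_free_list.push_back(sb); }
    cached_superblocks.clear();
  }
};
```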
+
+/**
+ * Test arena_memory_resource.
+ */
+
+TEST_F(ArenaTest, ThrowOnNullUpstream)  // NOLINT
 {
   auto construct_nullptr = []() { arena_mr mr{nullptr}; };
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
   EXPECT_THROW(construct_nullptr(), rmm::logic_error);
 }
 
-TEST(ArenaTest, ThrowMaxLessThanInitial)
+TEST_F(ArenaTest, SizeSmallerThanSuperblockSize)  // NOLINT
 {
-  // Make sure first argument is enough larger than the second that alignment rounding doesn't
-  // make them equal
-  auto max_less_than_initial = []() {
-    const auto initial{4_MiB};
-    const auto maximum{2_MiB};
-    cuda_mr cuda;
-    arena_mr mr{&cuda, initial, maximum};
-  };
-  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+  auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; };
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
+  EXPECT_THROW(construct_small(), rmm::logic_error);
+}
+
+TEST_F(ArenaTest, AllocateNinetyPercent)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    auto const free = rmm::detail::available_device_memory().first;
+    auto const ninety_percent =
+      rmm::detail::align_up(static_cast<std::size_t>(static_cast<double>(free) * 0.9),
+                            rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+    arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent);
+  }());
+}
+
+TEST_F(ArenaTest, SmallMediumLarge)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    arena_mr mr(rmm::mr::get_current_device_resource());
+    auto* small     = mr.allocate(256);
+    auto* medium    = mr.allocate(64_MiB);
+    auto const free = rmm::detail::available_device_memory().first;
+    auto* large     = mr.allocate(free / 3);
+    mr.deallocate(small, 256);
+    mr.deallocate(medium, 64_MiB);
+    mr.deallocate(large, free / 3);
+  }());
+}
+
+TEST_F(ArenaTest, Defragment)  // NOLINT
+{
+  EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
+    auto const arena_size = superblock::minimum_size * 4;
+    arena_mr mr(rmm::mr::get_current_device_resource(), arena_size);
+    std::vector<std::thread> threads;
+    std::size_t num_threads{4};
+    threads.reserve(num_threads);
+    for (std::size_t i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::thread([&] {
+        cuda_stream stream{};
+        void* ptr = mr.allocate(32_KiB, stream);
+        mr.deallocate(ptr, 32_KiB, stream);
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+
+    auto* ptr = mr.allocate(arena_size);
+    mr.deallocate(ptr, arena_size);
+  }());
 }
 
-TEST(ArenaTest, DumpLogOnFailure)
+TEST_F(ArenaTest, DumpLogOnFailure)  // NOLINT
 {
-  cuda_mr cuda;
-  arena_mr mr{&cuda, 1_MiB, 4_MiB, true};
+  arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB, true};
 
   {  // make the log interesting
     std::vector<std::thread> threads;
@@ -77,6 +553,7 @@ TEST(ArenaTest, DumpLogOnFailure)
     }
   }
 
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
   EXPECT_THROW(mr.allocate(8_MiB), rmm::out_of_memory);
 
   struct stat file_status {
@@ -85,10 +562,9 @@
   EXPECT_GE(file_status.st_size, 0);
 }
 
-TEST(ArenaTest, FeatureSupport)
+TEST_F(ArenaTest, FeatureSupport)  // NOLINT
 {
-  cuda_mr cuda;
-  arena_mr mr{&cuda, 1_MiB, 4_MiB};
+  arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB};
   EXPECT_TRUE(mr.supports_streams());
   EXPECT_FALSE(mr.supports_get_mem_info());
   auto [free, total] = mr.get_mem_info(rmm::cuda_stream_default);
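For downstream users, the visible change is the constructor: one optional `arena_size` instead of the old initial/maximum pair. A minimal usage sketch against the signature introduced in this PR, assuming a plain CUDA upstream and omitting error handling:

```cpp
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Omitting arena_size uses half of the free device memory.
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  // Route subsequent device allocations through the arena.
  rmm::mr::set_current_device_resource(&mr);

  void* ptr = mr.allocate(1U << 20);  // 1 MiB on the default stream
  mr.deallocate(ptr, 1U << 20);
  return 0;
}
```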