Reland "base/allocator: Add a thread cache to PartitionAlloc."

Changes: disabled the thread cache on Windows.

Original change's description:
> base/allocator: Add a thread cache to PartitionAlloc.
>
> This CL adds a thread cache to PartitionAlloc. It is optional, only
> applies to thread-safe partitions, and uses the same freelist encoding
> and bucketing as the main allocator.
>
> The thread cache is added "in the middle" of the main allocator, that is:
> - After all the cookie/tag management
> - Before the "raw" allocator.
>
> That is, the general allocation flow is:
> 1. Adjustment of requested size to make room for tags / cookies
> 2. Allocation:
>   a. Call to the thread cache; if it succeeds, return.
>   b. Otherwise, call the "raw" allocator <-- Locking
> 3. Handle cookies/tags, zero allocation if required
>
> On the deallocation side, the process is reversed:
> 1. Check cookies / tags, adjust the pointer
> 2. Deallocation
>   a. Return to the thread cache if possible. If it succeeds, return.
>   b. Otherwise, call the "raw" allocator <-- Locking
>
> The thread cache maintains an array of buckets, the same as the parent
> allocator. A single thread cache instance is only used by a single
> partition. Each bucket is a linked list of allocations, capped to a set
> maximum size. Elements in this "freelist" are encoded the same way they
> are for the main allocator.
> Only the smallest buckets are eligible for caching, to reduce the
> memory impact.
>
> There are several limitations:
> - Only a single partition is allowed to have a thread cache
> - No periodic purging of thread caches is done
> - No statistics are collected
>
> The last two limitations will be addressed in subsequent CLs. Regarding
> the first one, it is not possible to use Chrome's native thread local
> storage support, as it allocates. It is also desirable to use
> thread_local to improve performance.
>
> Bug: 998048
> Change-Id: Ia771f507d9dd1c2c26a4668c76da220fb0c65dd4
> Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2375206
> Commit-Queue: Benoit L <lizeb@chromium.org>
> Reviewed-by: Kentaro Hara <haraken@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#805697}

Bug: 998048
Change-Id: I23b70f6964bb297502921d1a08bf128d9093d577
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2404849
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Reviewed-by: Benoit L <lizeb@chromium.org>
Commit-Queue: Benoit L <lizeb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#806597}
Benoit Lize authored and Commit Bot committed Sep 14, 2020
1 parent cc7fafa commit d4e7ee6
Showing 8 changed files with 602 additions and 32 deletions.
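
The commit message's allocation and deallocation flows reduce to a thread cache probe wedged between the cookie/tag layer and the locked core. Below is a minimal sketch of the two fast paths; GetFromCache() and MaybePutInCache() are real names from this CL, while every other helper is a hypothetical stand-in for the cookie/tag handling and the locked slow path.

// Hypothetical stand-ins; only the two ThreadCache calls appear in this CL.
size_t AdjustSizeForExtras(size_t size);          // Step 1 of allocation.
void* HandleExtrasAndZero(void* ptr, int flags);  // Step 3 of allocation.
void* StripExtrasAndCheck(void* ptr);             // Step 1 of deallocation.
size_t SizeToBucketIndex(size_t size);
size_t BucketIndexForPointer(void* ptr);
void* RawAlloc(int flags, size_t size);           // <-- Locking
void RawFree(void* ptr);                          // <-- Locking

void* AllocFlagsNoHooks(int flags, size_t size) {
  size = AdjustSizeForExtras(size);               // 1. Room for tags/cookies.
  void* ptr = g_thread_cache->GetFromCache(SizeToBucketIndex(size));  // 2a.
  if (!ptr)
    ptr = RawAlloc(flags, size);                  // 2b. <-- Locking
  return HandleExtrasAndZero(ptr, flags);         // 3. Cookies/tags, zeroing.
}

void FreeNoHooks(void* ptr) {
  ptr = StripExtrasAndCheck(ptr);                 // 1. Check cookies/tags.
  if (g_thread_cache->MaybePutInCache(ptr, BucketIndexForPointer(ptr)))
    return;                                       // 2a. Cached, no lock taken.
  RawFree(ptr);                                   // 2b. <-- Locking
}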
3 changes: 3 additions & 0 deletions base/BUILD.gn
@@ -1793,6 +1793,8 @@ component("base") {
"allocator/partition_allocator/partition_tag_bitmap.h",
"allocator/partition_allocator/random.cc",
"allocator/partition_allocator/random.h",
"allocator/partition_allocator/thread_cache.cc",
"allocator/partition_allocator/thread_cache.h",
]
if (is_win) {
sources +=
@@ -3217,6 +3219,7 @@ test("base_unittests") {
"allocator/partition_allocator/memory_reclaimer_unittest.cc",
"allocator/partition_allocator/page_allocator_unittest.cc",
"allocator/partition_allocator/partition_alloc_unittest.cc",
"allocator/partition_allocator/thread_cache_unittest.cc",
]
}

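The BUILD.gn hunk above adds thread_cache.h and thread_cache.cc, whose diffs are not expanded on this page. From the commit message (per-bucket freelists mirroring the parent's bucketing, a fixed length cap, the same freelist encoding) and the calls visible in the hunks below, the interface is roughly as follows; the field names and the exact cap are assumptions, not the actual header:

// Rough sketch of the new ThreadCache interface; not the actual header.
class ThreadCache {
 public:
  static std::unique_ptr<ThreadCache> Create(
      PartitionRoot<internal::ThreadSafe>* root);
  static void ClaimThreadCacheAndCheck();  // Enforces the one-partition limit.

  // Returns a cached slot for |bucket_index|, or nullptr on a miss.
  void* GetFromCache(size_t bucket_index);
  // Takes ownership of |slot| if its bucket is small enough and below the
  // cap; returns false to fall back to the locked free path.
  bool MaybePutInCache(void* slot, size_t bucket_index);
  // Returns every cached slot of this thread to the owning partition.
  void Purge();

 private:
  struct Bucket {
    void* freelist_head = nullptr;  // Same freelist encoding as the parent.
    uint8_t count = 0;              // Capped at a small fixed maximum.
  };
  Bucket buckets_[kNumBuckets];  // Mirrors the parent's bucket array; only
                                 // the smallest buckets are ever populated.
  PartitionRoot<internal::ThreadSafe>* root_;  // The one owning partition.
};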
base/allocator/allocator_shim_default_dispatch_to_partition_alloc.cc
@@ -5,6 +5,7 @@
#include "base/allocator/allocator_shim.h"
#include "base/allocator/allocator_shim_internals.h"
#include "base/allocator/partition_allocator/partition_alloc.h"
#include "base/allocator/partition_allocator/partition_alloc_constants.h"
#include "base/bits.h"
#include "base/no_destructor.h"
#include "build/build_config.h"
@@ -74,8 +75,8 @@ base::ThreadSafePartitionRoot& Allocator() {
return *root;
}

auto* new_root = new (g_allocator_buffer)
base::ThreadSafePartitionRoot(false /* enforce_alignment */);
auto* new_root = new (g_allocator_buffer) base::ThreadSafePartitionRoot(
false /* enforce_alignment */, true /* enable_thread_cache */);
g_root_.store(new_root, std::memory_order_release);

// Semantically equivalent to base::Lock::Release().
@@ -100,8 +101,9 @@ void* PartitionMemalign(const AllocatorDispatch*,
size_t alignment,
size_t size,
void* context) {
// Since the general-purpose allocator uses the thread cache, this one cannot.
static base::NoDestructor<base::ThreadSafePartitionRoot> aligned_allocator{
true /* enforce_alignment */};
true /* enforce_alignment */, false /* enable_thread_cache */};
return aligned_allocator->AlignedAllocFlags(base::PartitionAllocNoHooks,
alignment, size);
}
22 changes: 20 additions & 2 deletions base/allocator/partition_allocator/partition_alloc.cc
@@ -201,7 +201,8 @@ void PartitionAllocGlobalUninitForTesting() {
}

template <bool thread_safe>
void PartitionRoot<thread_safe>::Init(bool enforce_alignment) {
void PartitionRoot<thread_safe>::Init(bool enforce_alignment,
bool enable_thread_cache) {
ScopedGuard guard{lock_};
if (initialized)
return;
@@ -215,6 +216,15 @@ void PartitionRoot<thread_safe>::Init(bool enforce_alignment) {
// If alignment needs to be enforced, disallow adding cookies and/or tags at
// the beginning of the slot.
allow_extras = !enforce_alignment;
#if defined(OS_WIN)
// Not supported on Windows due to TLS issues.
with_thread_cache = false;
#else
with_thread_cache = enable_thread_cache;

if (with_thread_cache)
internal::ThreadCache::ClaimThreadCacheAndCheck();
#endif

// We mark the sentinel bucket/page as free to make sure it is skipped by our
// logic to find a new active page.
@@ -280,6 +290,9 @@ void PartitionRoot<thread_safe>::Init(bool enforce_alignment) {
initialized = true;
}

template <bool thread_safe>
PartitionRoot<thread_safe>::~PartitionRoot() = default;

template <bool thread_safe>
bool PartitionRoot<thread_safe>::ReallocDirectMappedInPlace(
internal::PartitionPage<thread_safe>* page,
@@ -619,6 +632,10 @@ void PartitionRoot<thread_safe>::PurgeMemory(int flags) {
PartitionPurgeBucket(bucket);
}
}

// Purges only this thread's cache.
if (with_thread_cache && internal::g_thread_cache)
internal::g_thread_cache->Purge();
}

template <bool thread_safe>
@@ -806,7 +823,8 @@ void PartitionAllocator<thread_safe>::init(
PartitionAllocatorAlignment alignment) {
partition_root_.Init(
alignment ==
PartitionAllocatorAlignment::kAlignedAlloc /* enforce_alignment */);
PartitionAllocatorAlignment::kAlignedAlloc /* enforce_alignment */,
false /* enable_thread_cache */);
PartitionAllocMemoryReclaimer::Instance()->RegisterPartition(
&partition_root_);
}
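Init() above calls internal::ThreadCache::ClaimThreadCacheAndCheck(), and the fast paths read internal::g_thread_cache; both are defined in the new thread_cache files rather than in this diff. Per the commit message, the per-thread slot must be a plain thread_local, since Chrome's native thread local storage support allocates and therefore cannot sit underneath the allocator itself. A plausible sketch, with the claim logic being an assumption:

namespace internal {

// One slot per thread, shared by all partitions. The .get() calls in the
// fast paths suggest a smart-pointer-like holder that tears the cache down
// at thread exit.
thread_local std::unique_ptr<ThreadCache> g_thread_cache;

// Hypothetical body: only one partition may enable the cache, precisely
// because g_thread_cache is a single per-thread slot.
// static
void ThreadCache::ClaimThreadCacheAndCheck() {
  static std::atomic<bool> claimed{false};
  PA_CHECK(!claimed.exchange(true));  // A second claimant would CHECK here.
}

}  // namespace internal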
126 changes: 104 additions & 22 deletions base/allocator/partition_allocator/partition_alloc.h
@@ -53,6 +53,7 @@

#include <limits.h>
#include <string.h>
#include <memory>

#include <atomic>

@@ -68,6 +69,7 @@
#include "base/allocator/partition_allocator/partition_page.h"
#include "base/allocator/partition_allocator/partition_ref_count.h"
#include "base/allocator/partition_allocator/partition_tag.h"
#include "base/allocator/partition_allocator/thread_cache.h"
#include "base/base_export.h"
#include "base/bits.h"
#include "base/check_op.h"
@@ -371,6 +373,8 @@ struct BASE_EXPORT PartitionRoot {
using DirectMapExtent = internal::PartitionDirectMapExtent<thread_safe>;
using ScopedGuard = internal::ScopedGuard<thread_safe>;

bool with_thread_cache = false;

internal::MaybeSpinLock<thread_safe> lock_;
// Invariant: total_size_of_committed_pages <=
// total_size_of_super_pages +
@@ -409,10 +413,10 @@
Bucket buckets[kNumBuckets] = {};

PartitionRoot() = default;
explicit PartitionRoot(bool enable_tag_pointers) {
Init(enable_tag_pointers);
PartitionRoot(bool enable_tag_pointers, bool enable_thread_cache) {
Init(enable_tag_pointers, enable_thread_cache);
}
~PartitionRoot() = default;
~PartitionRoot();

// Public API
//
@@ -424,7 +428,7 @@
//
// Moving it a layer lower couples PartitionRoot and PartitionBucket, but
// preserves the layering of the includes.
void Init(bool enforce_alignment);
void Init(bool enforce_alignment, bool enable_thread_cache);

ALWAYS_INLINE static bool IsValidPage(Page* page);
ALWAYS_INLINE static PartitionRoot* FromPage(Page* page);
@@ -462,9 +466,7 @@ struct BASE_EXPORT PartitionRoot {
// this is marked |ALWAYS_INLINE|.
ALWAYS_INLINE void* AllocFlagsNoHooks(int flags, size_t size);

ALWAYS_INLINE void* Realloc(void* ptr,
size_t new_size,
const char* type_name);
ALWAYS_INLINE void* Realloc(void* ptr, size_t new_size, const char* type_name);
// Overload that may return nullptr if reallocation isn't possible. In this
// case, |ptr| remains valid.
ALWAYS_INLINE void* TryRealloc(void* ptr,
@@ -495,13 +497,18 @@
// Frees memory, with |ptr| as returned by |RawAlloc()|.
ALWAYS_INLINE void RawFree(void* ptr, Page* page);

internal::ThreadCache* thread_cache_for_testing() const {
return with_thread_cache ? internal::g_thread_cache.get() : nullptr;
}

private:
// Allocates memory, without any cookies / tags.
//
// |flags| and |size| are as in AllocFlags(). |allocated_size| and
// |is_already_zeroed| are output only. |allocated_size| is guaranteed to be
// greater than or equal to |size|.
ALWAYS_INLINE void* RawAlloc(int flags,
ALWAYS_INLINE void* RawAlloc(Bucket* bucket,
int flags,
size_t size,
size_t* allocated_size,
bool* is_already_zeroed);
@@ -523,13 +530,13 @@
#endif
}

private:
ALWAYS_INLINE void* AllocFromBucket(Bucket* bucket, int flags, size_t size)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
bool ReallocDirectMappedInPlace(internal::PartitionPage<thread_safe>* page,
size_t raw_size)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DecommitEmptyPages() EXCLUSIVE_LOCKS_REQUIRED(lock_);
static void RawFreeStatic(void* ptr);

friend class internal::ThreadCache;
};

static_assert(sizeof(PartitionRoot<internal::ThreadSafe>) ==
@@ -612,6 +619,15 @@ ALWAYS_INLINE void PartitionRoot<thread_safe>::Free(void* ptr) {
// static
template <bool thread_safe>
ALWAYS_INLINE void PartitionRoot<thread_safe>::FreeNoHooks(void* ptr) {
// The thread cache is added "in the middle" of the main allocator, that is:
// - After all the cookie/tag management
// - Before the "raw" allocator.
//
// On the deallocation side:
// 1. Check cookies / tags, adjust the pointer
// 2. Deallocation
//   a. Return to the thread cache if possible. If it succeeds, return.
// b. Otherwise, call the "raw" allocator <-- Locking
if (UNLIKELY(!ptr))
return;

@@ -678,6 +694,19 @@ ALWAYS_INLINE void PartitionRoot<thread_safe>::FreeNoHooks(void* ptr) {
memset(ptr, kFreedByte, page->GetAllocatedSize());
#endif

// TLS access can be expensive; do a cheap local check first.
//
// Also the thread-unsafe variant doesn't have a use for a thread cache, so
// make it statically known to the compiler.
if (thread_safe && root->with_thread_cache &&
LIKELY(!page->bucket->is_direct_mapped())) {
PA_DCHECK(page->bucket >= root->buckets);
size_t bucket_index = page->bucket - root->buckets;
internal::ThreadCache* thread_cache = internal::g_thread_cache.get();
if (thread_cache && thread_cache->MaybePutInCache(ptr, bucket_index))
return;
}

root->RawFree(ptr, page);
}

@@ -691,6 +720,14 @@ ALWAYS_INLINE void PartitionRoot<thread_safe>::RawFree(void* ptr, Page* page) {
deferred_unmap.Run();
}

// static
template <bool thread_safe>
void PartitionRoot<thread_safe>::RawFreeStatic(void* ptr) {
Page* page = Page::FromPointerNoAlignmentCheck(ptr);
auto* root = PartitionRoot<thread_safe>::FromPage(page);
root->RawFree(ptr, page);
}

// static
template <bool thread_safe>
ALWAYS_INLINE bool PartitionRoot<thread_safe>::IsValidPage(Page* page) {
@@ -811,7 +848,6 @@ PartitionRoot<thread_safe>::SizeToBucket(size_t size) const {
size_t sub_order_index = size & kOrderSubIndexMask[order];
Bucket* bucket = bucket_lookups[(order << kNumBucketsPerOrderBits) +
order_index + !!sub_order_index];
PA_CHECK(bucket);
PA_DCHECK(!bucket->slot_size || bucket->slot_size >= size);
PA_DCHECK(!(bucket->slot_size % kSmallestBucket));
return bucket;
@@ -834,7 +870,7 @@ ALWAYS_INLINE void* PartitionRoot<thread_safe>::AllocFlags(
return result;
#else
PA_DCHECK(initialized);
void* ret;
void* ret = nullptr;
const bool hooks_enabled = PartitionAllocHooks::AreHooksEnabled();
if (UNLIKELY(hooks_enabled)) {
if (PartitionAllocHooks::AllocationOverrideHookIfEnabled(&ret, flags, size,
@@ -858,13 +894,63 @@
template <bool thread_safe>
ALWAYS_INLINE void* PartitionRoot<thread_safe>::AllocFlagsNoHooks(int flags,
size_t size) {
// The thread cache is added "in the middle" of the main allocator, that is:
// - After all the cookie/tag management
// - Before the "raw" allocator.
//
// That is, the general allocation flow is:
// 1. Adjustment of requested size to make room for tags / cookies
// 2. Allocation:
//   a. Call to the thread cache; if it succeeds, go to step 3.
// b. Otherwise, call the "raw" allocator <-- Locking
// 3. Handle cookies/tags, zero allocation if required
size_t requested_size = size;
size = internal::PartitionSizeAdjustAdd(allow_extras, size);
PA_CHECK(size >= requested_size); // check for overflows

auto* bucket = SizeToBucket(size);
size_t allocated_size;
bool is_already_zeroed;
void* ret = RawAlloc(flags, size, &allocated_size, &is_already_zeroed);
void* ret = nullptr;

// !thread_safe => !with_thread_cache, but adding the condition allows the
// compiler to statically remove this branch for the thread-unsafe variant.
if (thread_safe && with_thread_cache) {
if (UNLIKELY(!internal::g_thread_cache))
internal::g_thread_cache = internal::ThreadCache::Create(this);
// bucket->slot_size is 0 for direct-mapped allocations, as their bucket is
// the sentinel one. However, since we are touching *bucket, we may as well
// check it directly, rather than fetching the sentinel one, and comparing
// the addresses. Since the sentinel bucket is *not* part of the |buckets|
// array, |bucket_index| is not valid for the sentinel one.
//
// TODO(lizeb): Consider making Bucket::sentinel per-PartitionRoot, at the
// end of the |buckets| array. This would remove this branch.
if (LIKELY(bucket->slot_size)) {
PA_DCHECK(bucket != Bucket::get_sentinel_bucket());
PA_DCHECK(bucket >= buckets && bucket < (buckets + kNumBuckets));
size_t bucket_index = bucket - buckets;
ret = internal::g_thread_cache->GetFromCache(bucket_index);
is_already_zeroed = false;
allocated_size = bucket->slot_size;

#if DCHECK_IS_ON()
// Make sure that the allocated pointer comes from the same place it would
// for a non-thread cache allocation.
if (ret) {
Page* page = Page::FromPointerNoAlignmentCheck(ret);
PA_DCHECK(IsValidPage(page));
PA_DCHECK(page->bucket == &buckets[bucket_index]);
}
#endif
} else {
PA_DCHECK(bucket == Bucket::get_sentinel_bucket());
}
}

if (!ret)
ret = RawAlloc(bucket, flags, size, &allocated_size, &is_already_zeroed);

if (UNLIKELY(!ret))
return nullptr;

@@ -925,18 +1011,14 @@ ALWAYS_INLINE void* PartitionRoot<thread_safe>::AllocFlagsNoHooks(int flags,

template <bool thread_safe>
ALWAYS_INLINE void* PartitionRoot<thread_safe>::RawAlloc(
Bucket* bucket,
int flags,
size_t size,
size_t* allocated_size,
bool* is_already_zeroed) {
auto* bucket = SizeToBucket(size);
PA_DCHECK(bucket);

{
internal::ScopedGuard<thread_safe> guard{lock_};
return AllocFromBucket(bucket, flags, size, allocated_size,
is_already_zeroed);
}
internal::ScopedGuard<thread_safe> guard{lock_};
return AllocFromBucket(bucket, flags, size, allocated_size,
is_already_zeroed);
}

template <bool thread_safe>
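For callers, opting into the cache is only the new constructor argument, as the allocator shim's Allocator() does above. A minimal standalone usage sketch, assuming no other partition has already claimed the thread cache and taking the Alloc()/Free() signatures from the rest of this header:

base::ThreadSafePartitionRoot root(false /* enforce_alignment */,
                                   true /* enable_thread_cache */);

void Example() {
  // The first allocation on a thread lazily creates that thread's cache.
  void* p = root.Alloc(64, "example");
  // Freeing a small slot parks it in the thread cache instead of the
  // partition's freelist...
  root.Free(p);
  // ...so another allocation from the same bucket is typically served
  // without taking the partition lock.
  void* q = root.Alloc(64, "example");
  root.Free(q);
}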
11 changes: 6 additions & 5 deletions base/allocator/partition_allocator/partition_alloc_perftest.cc
@@ -2,7 +2,10 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <vector>

#include "base/allocator/partition_allocator/partition_alloc.h"
@@ -30,7 +33,7 @@ namespace {
// Change kTimeLimit to something higher if you need more time to capture a
// trace.
constexpr base::TimeDelta kTimeLimit = base::TimeDelta::FromSeconds(2);
constexpr int kWarmupRuns = 5;
constexpr int kWarmupRuns = 10000;
constexpr int kTimeCheckInterval = 100000;

// Size constants are mostly arbitrary, but try to simulate something like CSS
@@ -78,12 +81,10 @@ class PartitionAllocator : public Allocator {
void* Alloc(size_t size) override {
return alloc_.AllocFlagsNoHooks(0, size);
}
void Free(void* data) override {
base::ThreadSafePartitionRoot::FreeNoHooks(data);
}
void Free(void* data) override { ThreadSafePartitionRoot::FreeNoHooks(data); }

private:
base::ThreadSafePartitionRoot alloc_{false};
ThreadSafePartitionRoot alloc_{false, false};
};

class TestLoopThread : public PlatformThread::Delegate {
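The kWarmupRuns bump from 5 to 10,000 matters because a thread's first runs now also populate its brand-new thread cache; measuring earlier would understate steady-state throughput. A sketch of how these constants typically drive the timing loop (the harness itself lies outside the hunks shown, so the names here are assumptions):

#include "base/timer/elapsed_timer.h"

float MeasureRunsPerSecond(Allocator* allocator) {
  for (int i = 0; i < kWarmupRuns; ++i)
    LaunchSingleRun(allocator);  // Hypothetical benchmark body; also warms
                                 // the new thread cache.
  base::ElapsedTimer timer;
  int64_t iterations = 0;
  do {
    // Reading the clock is comparatively expensive, so amortize the check
    // over kTimeCheckInterval runs.
    for (int i = 0; i < kTimeCheckInterval; ++i)
      LaunchSingleRun(allocator);
    iterations += kTimeCheckInterval;
  } while (timer.Elapsed() < kTimeLimit);
  return iterations / timer.Elapsed().InSecondsF();
}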
(The diffs for the remaining files, including base/allocator/partition_allocator/thread_cache.h, thread_cache.cc, and thread_cache_unittest.cc, are not expanded on this page.)