[Static runtime] refactor MemoryPlanner codes to prepare for output tensor memory planning (pytorch#55809)

Summary:
Pull Request resolved: pytorch#55809

[Static runtime] refactor MemoryPlanner codes to prepare for output tensor memory planning

Test Plan: buck test mode/dev //caffe2/caffe2/fb/predictor:pytorch_predictor_test -- --exact 'caffe2/caffe2/fb/predictor:pytorch_predictor_test - PyTorchPredictor.StaticRuntime'

Reviewed By: bwasti

Differential Revision: D27411416

fbshipit-source-id: 7dae7c2586ce3b4ebacf6169017140166c30e99c
Peng Wu authored and facebook-github-bot committed Apr 13, 2021
1 parent 6269efd commit 18662d4
Showing 4 changed files with 104 additions and 59 deletions.
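The bulk of the diff below lifts the storage-grouping loop out of the MemoryPlanner constructor into a standalone assign_storage_to_managed_tensors helper, so the same logic can later be reused for output tensors (the commented-out output_* members added to impl.h mark that follow-up). As orientation, here is a minimal, self-contained sketch of that grouping step; Value and StorageImpl below are toy stand-ins for torch::jit::Value and c10::StorageImpl, and the managed_tensors input is a hypothetical flattening of the managed tensor outputs, not part of the real API.

// Sketch only: how values that may share storage end up in one managed slot.
#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

struct Value {};        // stand-in for torch::jit::Value
struct StorageImpl {};  // stand-in for c10::StorageImpl

// Each entry: (bytes to allocate, StorageImpls backed by that same slice).
using ManagedStorage = std::vector<std::pair<size_t, std::vector<StorageImpl*>>>;

void assign_storage(
    const std::vector<std::pair<const Value*, StorageImpl*>>& managed_tensors,
    const std::unordered_map<const Value*, std::vector<const Value*>>&
        value_to_same_storage_values,
    ManagedStorage& managed_storage) {
  std::unordered_map<const Value*, size_t> value_to_storage_idx;
  for (const auto& entry : managed_tensors) {
    const Value* val = entry.first;
    StorageImpl* impl = entry.second;
    auto it = value_to_storage_idx.find(val);
    if (it != value_to_storage_idx.end()) {
      // This value was pre-assigned to an existing group by an earlier member.
      managed_storage[it->second].second.push_back(impl);
    } else {
      // First member of a new group: open a slot, then pre-assign the whole
      // same-storage group to that slot's index.
      managed_storage.emplace_back(0, std::vector<StorageImpl*>{impl});
      const size_t idx = managed_storage.size() - 1;
      auto same = value_to_same_storage_values.find(val);
      if (same != value_to_same_storage_values.end()) {
        for (const Value* v : same->second) {
          value_to_storage_idx[v] = idx;
        }
      }
    }
  }
}

Every member of a value_to_same_storage_values group is pre-assigned the index opened by the group's first member, which is how several tensors end up backed by one slice of the planner's arena.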
143 changes: 89 additions & 54 deletions torch/csrc/jit/runtime/static/impl.cpp
@@ -969,7 +969,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
// check for intermediates
if (!ival->isNone()) {
TORCH_CHECK(
ival->isTensor() || canOptimizeConstruct(pnode.node()),
ival->isTensor() || isOptimizableContainerType(pnode.node()),
error_msg);
if (ival->isTensor()) {
const auto& t = ival->toTensor();
@@ -989,55 +989,14 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
}
}

MemoryPlanner::MemoryPlanner(
StaticRuntime* runtime,
static void assign_storage_to_managed_tensors(
const StaticRuntime* runtime,
const std::unordered_set<const Value*>& managed_tensor_values,
const std::unordered_map<const Value*, std::vector<const Value*>>&
value_to_same_storage_values,
const std::unordered_set<const Value*>& external_values,
bool out_variants) {
// collect register indices of outputs of ops with out variant
std::unordered_set<const Value*> managed_values;
std::unordered_set<IValue*> unmanaged_ivalues;
for (ProcessedNode& pnode : runtime->nodes()) {
if (canReuseInputsOutputs(pnode.node())) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
// Types are stored in the underlying TorchScript IR
const Value* out_v = pnode.node()->outputs()[i];
IValue& out = pnode.Output(i);
const auto& type = out_v->type();
if (out_variants && !external_values.count(out_v)) {
if (type->cast<TensorType>()) {
managed_values.insert(out_v);
} else if (canOptimizeConstruct(pnode.node())) {
// We "leak" containers of this type
} else {
unmanaged_ivalues.insert(&out);
}
} else {
unmanaged_ivalues.insert(&out);
}
}
} else {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
unmanaged_ivalues.insert(&pnode.Output(i));
}
}
}

// remove model outputs from managed_values and unmanaged_ivalues
for (const Value* output : runtime->graph().outputs()) {
managed_values.erase(output);
}
for (IValue* output : runtime->outputs()) {
unmanaged_ivalues.erase(output);
}

// unmanaged_ivalues => unmanaged_ivalues_
for (IValue* out : unmanaged_ivalues) {
unmanaged_ivalues_.emplace_back(out);
}

// map Value to index to managed_storage_, where multiple values can
std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>&
managed_storage) {
// map Value to index to managed_storage, where multiple values can
// map to the same index (i.e., sharing the same storage)
std::unordered_map<const Value*, size_t> value_to_storage_idx;
// the StorageImpls of Tensor views should not be managed
@@ -1048,7 +1007,7 @@ MemoryPlanner::MemoryPlanner(
for (auto i = 0; i < pnode.outputs().size(); ++i) {
const auto& ival = pnode.outputs()[i];
const auto* val = pnode.node()->outputs()[i];
if (managed_values.count(val)) {
if (managed_tensor_values.count(val)) {
TORCH_CHECK(ival.isTensor());
auto* impl = ival.toTensor().storage().unsafeGetStorageImpl();

@@ -1058,17 +1017,18 @@ MemoryPlanner::MemoryPlanner(
}

if (value_to_storage_idx.count(val)) {
managed_storage_[value_to_storage_idx.at(val)].second.emplace_back(
managed_storage[value_to_storage_idx.at(val)].second.emplace_back(
impl);
} else {
auto p =
std::make_pair<size_t, std::vector<c10::StorageImpl*>>(0, {impl});
managed_storage_.emplace_back(std::move(p));
managed_storage.emplace_back(std::move(p));
// first of a group, update the value_to_storage_idx map with the
// index
if (value_to_same_storage_values.count(val)) {
auto storage_idx = managed_storage.size() - 1;
for (const auto* v : value_to_same_storage_values.at(val)) {
value_to_storage_idx[v] = managed_storage_.size() - 1;
value_to_storage_idx[v] = storage_idx;
}
}
}
@@ -1077,6 +1037,73 @@ MemoryPlanner::MemoryPlanner(
}
}

MemoryPlanner::MemoryPlanner(
StaticRuntime* runtime,
const std::unordered_map<const Value*, std::vector<const Value*>>&
value_to_same_storage_values,
const std::unordered_set<const Value*>& external_values,
bool out_variants) {
// collect register indices of outputs of ops with out variant
std::unordered_set<const Value*> managed_tensor_values;
std::unordered_set<const Value*> leaked_values;
if (out_variants) {
for (ProcessedNode& pnode : runtime->nodes()) {
if (canReuseInputsOutputs(pnode.node())) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
const Value* out_v = pnode.node()->outputs()[i];
if (external_values.count(out_v)) {
continue;
}
// Types are stored in the underlying TorchScript IR
const auto& type = out_v->type();
if (type->cast<TensorType>()) {
managed_tensor_values.insert(out_v);
} else if (isOptimizableContainerType(pnode.node())) {
// We "leak" certain container types because their allocations take
// a long time
leaked_values.insert(out_v);
}
}
}
}
}

// collect unmanaged output ivalues
std::unordered_set<IValue*> unmanaged_ivalues;
for (ProcessedNode& pnode : runtime->nodes()) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
// Types are stored in the underlying TorchScript IR
const Value* out_v = pnode.node()->outputs()[i];
if (managed_tensor_values.count(out_v) || leaked_values.count(out_v)) {
continue;
}
IValue& out = pnode.Output(i);
unmanaged_ivalues.insert(&out);
}
}
// since runtime->outputs() escape from run(), remove them from
// managed_tensor_values and from unmanaged_ivalues
for (const Value* output : runtime->graph().outputs()) {
managed_tensor_values.erase(output);
}
for (IValue* output : runtime->outputs()) {
unmanaged_ivalues.erase(output);
}

// copy to unmanaged_ivalues_
for (IValue* out : unmanaged_ivalues) {
unmanaged_ivalues_.emplace_back(out);
}

if (out_variants) {
::torch::jit::assign_storage_to_managed_tensors(
runtime,
managed_tensor_values,
value_to_same_storage_values,
managed_tensor_storage_);
}
}

// Don't change the size if it is already aligned, otherwise increase the size
// to make it aligned.
size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
@@ -1099,7 +1126,7 @@ void MemoryPlanner::allocate() {
uint8_t* start = static_cast<uint8_t*>(buffer_.get());

reused_tensors_ = 0;
for (const auto& ms : managed_storage_) {
for (const auto& ms : managed_tensor_storage_) {
auto tensor_size = ms.first;
if (tensor_size == 0) {
continue;
@@ -1125,17 +1152,25 @@ void MemoryPlanner::deallocate() {

// free memory used by outputs of ops in out variants
// but keep the TensorImpl and StorageImpl around
for (auto& ms : managed_storage_) {
for (auto& ms : managed_tensor_storage_) {
const auto& impls = ms.second;
size_t max = 0;
for (auto& impl : impls) {
size_t current_size = compute_aligned_tensor_size(impl->nbytes());
impl->reset();
max = std::max(max, current_size);
}
// Static runtime does not know the size of tensors statically, so we use
// the tensor size from the previous run to allocate tensors for the next
// run (following C2 tradition), exploiting the fact that tensor storage
// size does not have to match that of real tensor size. The following logic
// records the tensor storage size for the next run.
ms.first = max;
managed_bytes_ += max;
}

// for unmanaged ivalues (either tensor or non-tensor), we reset the *iv so
// that the objects pointed to by *iv may be reclaimed by reference counting
for (auto& iv : unmanaged_ivalues_) {
*iv = IValue();
}
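The deallocate()/allocate() hunks above keep the C2-style trick described in the new comment: when a run ends, each storage group's peak aligned size is recorded in place of statically known shapes, and the next allocate() carves a single arena from those recorded sizes. A self-contained sketch of that cycle follows; Storage is a toy stand-in for c10::StorageImpl, the unique_ptr buffer replaces at::DataPtr, and the 64-byte alignment is an assumption (the real constant is defined in impl.cpp, outside this diff).

// Sketch only: simplified record-then-reserve arena cycle.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct Storage {
  void* data = nullptr;
  size_t nbytes = 0;
  void reset() { data = nullptr; }  // drop the payload, keep the object alive
};

constexpr size_t kAlignment = 64;  // assumed for illustration

// Round nbytes up to a multiple of kAlignment (no-op if already aligned).
size_t compute_aligned_tensor_size(size_t nbytes) {
  return (nbytes + kAlignment - 1) & ~(kAlignment - 1);
}

struct Planner {
  // (bytes reserved for the group, storages sharing that reservation)
  std::vector<std::pair<size_t, std::vector<Storage*>>> managed;
  std::unique_ptr<uint8_t[]> buffer;

  // After a run: free payloads but remember each group's peak aligned size,
  // so the next allocate() can reserve enough without static shape knowledge.
  void deallocate() {
    for (auto& ms : managed) {
      size_t max = 0;
      for (Storage* s : ms.second) {
        max = std::max(max, compute_aligned_tensor_size(s->nbytes));
        s->reset();
      }
      ms.first = max;
    }
  }

  // Before the next run: one arena sized from the recorded peaks, with each
  // group of storages pointed at its own slice.
  void allocate() {
    size_t total = 0;
    for (const auto& ms : managed) total += ms.first;
    buffer = std::make_unique<uint8_t[]>(total);
    size_t offset = 0;
    for (auto& ms : managed) {
      if (ms.first == 0) continue;
      for (Storage* s : ms.second) s->data = buffer.get() + offset;
      offset += ms.first;
    }
  }
};

The point the new comment makes is that the reserved size does not have to match a tensor's current size; it only has to be remembered so the next run can reserve at least as much up front.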
12 changes: 11 additions & 1 deletion torch/csrc/jit/runtime/static/impl.h
@@ -295,16 +295,26 @@ class MemoryPlanner {
}

private:
// ivalues created in one run but not managed by MemoryPlanner
std::vector<IValue*> unmanaged_ivalues_;

// each pair contains the size (in bytes) of data to be allocated
// and a vector of StorageImpl's that should be backed by that same data
// Thus, if memonger is disabled, all vectors are of size 1.
std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
managed_storage_;
managed_tensor_storage_;
size_t managed_bytes_{0};
size_t reused_tensors_{0};
at::DataPtr buffer_; // allocated each time we call Run()

// since output tensors are alive after one inference, their storage
// is managed differently (e.g., deallocation happens at client side)
// std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
// managed_output_storage_;
// size_t managed_output_bytes_{0};
// size_t reused_output_tensors_{0};
// at::DataPtr output_buffer_; // allocated each time we call Run()

static size_t compute_aligned_tensor_size(size_t nbytes);
static at::DataPtr allocate_buffer(size_t size);
};
6 changes: 3 additions & 3 deletions torch/csrc/jit/runtime/static/ops.cpp
@@ -159,7 +159,7 @@ bool inputsCanRunOutOfPlace(Node* n) {
return true;
}

bool canOptimizeConstruct(Node* n) {
bool isOptimizableContainerType(Node* n) {
const auto& type = n->output()->type();
if (type->kind() == TypeKind::ListType) {
const auto& list_type = type->expectRef<ListType>();
@@ -184,7 +184,7 @@ REGISTER_OPERATOR_FUNCTOR(
prim_ListConstruct,
[](Node* n) -> SROperator {
const auto& type = n->output()->type()->expectRef<ListType>();
bool can_optimize = canOptimizeConstruct(n);
bool can_optimize = isOptimizableContainerType(n);
return [can_optimize, &type](ProcessedNode* p_node) {
const auto& out_l = p_node->Output(0);
if (!out_l.isNone() && can_optimize) {
@@ -204,7 +204,7 @@ REGISTER_OPERATOR_FUNCTOR(
prim::TupleConstruct,
prim_TupleConstruct,
[](Node* n) -> SROperator {
bool can_optimize = canOptimizeConstruct(n);
bool can_optimize = isOptimizableContainerType(n);
return [can_optimize](ProcessedNode* p_node) {
const auto& out_l = p_node->Output(0);
if (!out_l.isNone() && can_optimize) {
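The ops.cpp hunks rename canOptimizeConstruct to isOptimizableContainerType; the ListConstruct and TupleConstruct functors capture its result as can_optimize and combine it with !out_l.isNone() to skip rebuilding an output container that already exists (the body of that branch is elided in this view). Only the start of the predicate is visible here, so the following is a hedged sketch of what such a check might look like, using a simplified type model and an assumed "all contained elements are tensors" rule rather than the real TorchScript TypeKind/ListType logic.

// Sketch only: a toy predicate in the spirit of isOptimizableContainerType.
// The exact qualifying rule is not fully visible in this diff, so the
// tensor-elements-only check below is an assumption.
#include <memory>
#include <vector>

enum class Kind { Tensor, List, Tuple, Other };

struct Type {
  Kind kind;
  std::vector<std::shared_ptr<Type>> contained;  // element types for List/Tuple
};

bool isOptimizableContainerType(const Type& output_type) {
  if (output_type.kind != Kind::List && output_type.kind != Kind::Tuple) {
    return false;
  }
  // Assumed rule: the container may be reused across runs only if every
  // contained element is a tensor whose storage the planner already manages.
  for (const auto& elem : output_type.contained) {
    if (!elem || elem->kind != Kind::Tensor) {
      return false;
    }
  }
  return true;
}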
2 changes: 1 addition & 1 deletion torch/csrc/jit/runtime/static/ops.h
@@ -87,7 +87,7 @@ bool opIsRegistered(const c10::Symbol& op_name);

bool canRunOutOfPlace(Node* n);
bool canReuseInputsOutputs(Node* n);
bool canOptimizeConstruct(Node* n);
bool isOptimizableContainerType(Node* n);

std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n);

