diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index ea8f6ef6e9081..ef70c8eec7427 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -969,7 +969,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
       // check for intermediates
       if (!ival->isNone()) {
         TORCH_CHECK(
-            ival->isTensor() || canOptimizeConstruct(pnode.node()),
+            ival->isTensor() || isOptimizableContainerType(pnode.node()),
             error_msg);
         if (ival->isTensor()) {
           const auto& t = ival->toTensor();
@@ -989,55 +989,14 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
   }
 }
 
-MemoryPlanner::MemoryPlanner(
-    StaticRuntime* runtime,
+static void assign_storage_to_managed_tensors(
+    const StaticRuntime* runtime,
+    const std::unordered_set<const Value*>& managed_tensor_values,
     const std::unordered_map<const Value*, std::vector<const Value*>>&
         value_to_same_storage_values,
-    const std::unordered_set<const Value*>& external_values,
-    bool out_variants) {
-  // collect register indices of outputs of ops with out variant
-  std::unordered_set<const Value*> managed_values;
-  std::unordered_set<IValue*> unmanaged_ivalues;
-  for (ProcessedNode& pnode : runtime->nodes()) {
-    if (canReuseInputsOutputs(pnode.node())) {
-      for (auto i = 0; i < pnode.outputs().size(); ++i) {
-        // Types are stored in the underlying TorchScript IR
-        const Value* out_v = pnode.node()->outputs()[i];
-        IValue& out = pnode.Output(i);
-        const auto& type = out_v->type();
-        if (out_variants && !external_values.count(out_v)) {
-          if (type->cast<TensorType>()) {
-            managed_values.insert(out_v);
-          } else if (canOptimizeConstruct(pnode.node())) {
-            // We "leak" containers of this type
-          } else {
-            unmanaged_ivalues.insert(&out);
-          }
-        } else {
-          unmanaged_ivalues.insert(&out);
-        }
-      }
-    } else {
-      for (auto i = 0; i < pnode.outputs().size(); ++i) {
-        unmanaged_ivalues.insert(&pnode.Output(i));
-      }
-    }
-  }
-
-  // remove model outputs from managed_values and unmanaged_ivalues
-  for (const Value* output : runtime->graph().outputs()) {
-    managed_values.erase(output);
-  }
-  for (IValue* output : runtime->outputs()) {
-    unmanaged_ivalues.erase(output);
-  }
-
-  // unmanaged_ivalues => unmanaged_ivalues_
-  for (IValue* out : unmanaged_ivalues) {
-    unmanaged_ivalues_.emplace_back(out);
-  }
-
-  // map Value to index to managed_storage_, where multiple values can
+    std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>&
+        managed_storage) {
+  // map Value to index to managed_storage, where multiple values can
   // map to the same index (i.e., sharing the same storage)
   std::unordered_map<const Value*, size_t> value_to_storage_idx;
   // the StorageImpls of Tensor views should not be managed
@@ -1048,7 +1007,7 @@ MemoryPlanner::MemoryPlanner(
     for (auto i = 0; i < pnode.outputs().size(); ++i) {
       const auto& ival = pnode.outputs()[i];
      const auto* val = pnode.node()->outputs()[i];
-      if (managed_values.count(val)) {
+      if (managed_tensor_values.count(val)) {
        TORCH_CHECK(ival.isTensor());
        auto* impl = ival.toTensor().storage().unsafeGetStorageImpl();
 
@@ -1058,17 +1017,18 @@ MemoryPlanner::MemoryPlanner(
        }
 
        if (value_to_storage_idx.count(val)) {
-          managed_storage_[value_to_storage_idx.at(val)].second.emplace_back(
+          managed_storage[value_to_storage_idx.at(val)].second.emplace_back(
              impl);
        } else {
          auto p =
              std::make_pair<size_t, std::vector<c10::StorageImpl*>>(0, {impl});
-          managed_storage_.emplace_back(std::move(p));
+          managed_storage.emplace_back(std::move(p));
          // first of a group, update the value_to_storage_idx map with the
          // index
          if (value_to_same_storage_values.count(val)) {
+            auto storage_idx = managed_storage.size() - 1;
            for (const auto* v : value_to_same_storage_values.at(val)) {
-              value_to_storage_idx[v] = managed_storage_.size() - 1;
+              value_to_storage_idx[v] = storage_idx;
            }
          }
        }
@@ -1077,6 +1037,73 @@ MemoryPlanner::MemoryPlanner(
   }
 }
 
+MemoryPlanner::MemoryPlanner(
+    StaticRuntime* runtime,
+    const std::unordered_map<const Value*, std::vector<const Value*>>&
+        value_to_same_storage_values,
+    const std::unordered_set<const Value*>& external_values,
+    bool out_variants) {
+  // collect register indices of outputs of ops with out variant
+  std::unordered_set<const Value*> managed_tensor_values;
+  std::unordered_set<const Value*> leaked_values;
+  if (out_variants) {
+    for (ProcessedNode& pnode : runtime->nodes()) {
+      if (canReuseInputsOutputs(pnode.node())) {
+        for (auto i = 0; i < pnode.outputs().size(); ++i) {
+          const Value* out_v = pnode.node()->outputs()[i];
+          if (external_values.count(out_v)) {
+            continue;
+          }
+          // Types are stored in the underlying TorchScript IR
+          const auto& type = out_v->type();
+          if (type->cast<TensorType>()) {
+            managed_tensor_values.insert(out_v);
+          } else if (isOptimizableContainerType(pnode.node())) {
+            // We "leak" certain container types because their allocations take
+            // a long time
+            leaked_values.insert(out_v);
+          }
+        }
+      }
+    }
+  }
+
+  // collect unmanaged output ivalues
+  std::unordered_set<IValue*> unmanaged_ivalues;
+  for (ProcessedNode& pnode : runtime->nodes()) {
+    for (auto i = 0; i < pnode.outputs().size(); ++i) {
+      // Types are stored in the underlying TorchScript IR
+      const Value* out_v = pnode.node()->outputs()[i];
+      if (managed_tensor_values.count(out_v) || leaked_values.count(out_v)) {
+        continue;
+      }
+      IValue& out = pnode.Output(i);
+      unmanaged_ivalues.insert(&out);
+    }
+  }
+  // since runtime->outputs() escape from run(), remove them from
+  // managed_tensor_values and from unmanaged_ivalues
+  for (const Value* output : runtime->graph().outputs()) {
+    managed_tensor_values.erase(output);
+  }
+  for (IValue* output : runtime->outputs()) {
+    unmanaged_ivalues.erase(output);
+  }
+
+  // copy to unmanaged_ivalues_
+  for (IValue* out : unmanaged_ivalues) {
+    unmanaged_ivalues_.emplace_back(out);
+  }
+
+  if (out_variants) {
+    ::torch::jit::assign_storage_to_managed_tensors(
+        runtime,
+        managed_tensor_values,
+        value_to_same_storage_values,
+        managed_tensor_storage_);
+  }
+}
+
 // Don't change the size if it is already aligned, otherwise increase the size
 // to make it aligned.
 size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
@@ -1099,7 +1126,7 @@ void MemoryPlanner::allocate() {
   uint8_t* start = static_cast<uint8_t*>(buffer_.get());
   reused_tensors_ = 0;
-  for (const auto& ms : managed_storage_) {
+  for (const auto& ms : managed_tensor_storage_) {
     auto tensor_size = ms.first;
     if (tensor_size == 0) {
       continue;
     }
@@ -1125,7 +1152,7 @@ void MemoryPlanner::deallocate() {
 
   // free memory used by outputs of ops in out variants
   // but keep the TensorImpl and StorageImpl around
-  for (auto& ms : managed_storage_) {
+  for (auto& ms : managed_tensor_storage_) {
     const auto& impls = ms.second;
     size_t max = 0;
     for (auto& impl : impls) {
@@ -1133,9 +1160,17 @@ void MemoryPlanner::deallocate() {
      impl->reset();
      max = std::max(max, current_size);
    }
+    // Static runtime does not know the size of tensors statically, so we use
+    // the tensor size from the previous run to allocate tensors for the next
+    // run (following C2 tradition), exploiting the fact that tensor storage
+    // size does not have to match that of real tensor size. The following logic
+    // records the tensor storage size for the next run.
    ms.first = max;
    managed_bytes_ += max;
  }
+
+  // for unmanaged ivalues (either tensor or non-tensor), we reset the *iv so
+  // that the objects pointed to by *iv may be reclaimed by reference counting
   for (auto& iv : unmanaged_ivalues_) {
     *iv = IValue();
   }
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index 211601689c351..2ce3659e3f5bb 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -295,16 +295,26 @@ class MemoryPlanner {
   }
 
  private:
+  // ivalues created in one run but not managed by MemoryPlanner
   std::vector<IValue*> unmanaged_ivalues_;
+
   // each pair contains the size (in bytes) of data to be allocated
   // and a vector of StorageImpl's that should be backed by that same data
   // Thus, if memonger is disabled, all vectors are of size 1.
   std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
-      managed_storage_;
+      managed_tensor_storage_;
   size_t managed_bytes_{0};
   size_t reused_tensors_{0};
   at::DataPtr buffer_; // allocated each time we call Run()
 
+  // since output tensors are alive after one inference, their storage
+  // is managed differently (e.g., deallocation happens at client side)
+  // std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
+  //     managed_output_storage_;
+  // size_t managed_output_bytes_{0};
+  // size_t reused_output_tensors_{0};
+  // at::DataPtr output_buffer_; // allocated each time we call Run()
+
   static size_t compute_aligned_tensor_size(size_t nbytes);
   static at::DataPtr allocate_buffer(size_t size);
 };
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index d748d8c635768..448cf3560d821 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -159,7 +159,7 @@ bool inputsCanRunOutOfPlace(Node* n) {
   return true;
 }
 
-bool canOptimizeConstruct(Node* n) {
+bool isOptimizableContainerType(Node* n) {
   const auto& type = n->output()->type();
   if (type->kind() == TypeKind::ListType) {
     const auto& list_type = type->expectRef<ListType>();
@@ -184,7 +184,7 @@ REGISTER_OPERATOR_FUNCTOR(
     prim_ListConstruct,
     [](Node* n) -> SROperator {
       const auto& type = n->output()->type()->expectRef<ListType>();
-      bool can_optimize = canOptimizeConstruct(n);
+      bool can_optimize = isOptimizableContainerType(n);
       return [can_optimize, &type](ProcessedNode* p_node) {
         const auto& out_l = p_node->Output(0);
         if (!out_l.isNone() && can_optimize) {
@@ -204,7 +204,7 @@ REGISTER_OPERATOR_FUNCTOR(
     prim::TupleConstruct,
     prim_TupleConstruct,
     [](Node* n) -> SROperator {
-      bool can_optimize = canOptimizeConstruct(n);
+      bool can_optimize = isOptimizableContainerType(n);
       return [can_optimize](ProcessedNode* p_node) {
         const auto& out_l = p_node->Output(0);
         if (!out_l.isNone() && can_optimize) {
diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h
index 495c3ae904499..07cbab52a8423 100644
--- a/torch/csrc/jit/runtime/static/ops.h
+++ b/torch/csrc/jit/runtime/static/ops.h
@@ -87,7 +87,7 @@ bool opIsRegistered(const c10::Symbol& op_name);
 bool canRunOutOfPlace(Node* n);
 bool canReuseInputsOutputs(Node* n);
-bool canOptimizeConstruct(Node* n);
+bool isOptimizableContainerType(Node* n);
 
 std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n);
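Note (illustration only, not part of the patch): the comments added to `MemoryPlanner::deallocate()` describe a size-feedback scheme: each group of storages sharing one slice of the planner's buffer records the largest byte count observed during a run, and the next `allocate()` call uses those recorded sizes to carve a single contiguous buffer. The sketch below is a minimal, self-contained rendering of that idea using toy stand-in types (`ToyStorage`, `ToyPlanner`); it is not the real `StaticRuntime`/`c10::StorageImpl` API.

```cpp
// Simplified sketch of the MemoryPlanner size-feedback scheme:
// allocate() carves one buffer using sizes recorded by the previous run,
// deallocate() records the largest size seen so the next run fits.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct ToyStorage {
  size_t nbytes = 0;    // bytes actually used by this run
  void* data = nullptr; // points into the planner's shared buffer
};

struct ToyPlanner {
  // each pair: (planned slice size in bytes, storages sharing that slice)
  std::vector<std::pair<size_t, std::vector<ToyStorage*>>> managed;
  std::unique_ptr<uint8_t[]> buffer;
  size_t managed_bytes = 0;

  void allocate() {
    // one contiguous buffer, sized by the previous run's recordings
    buffer = std::make_unique<uint8_t[]>(std::max<size_t>(managed_bytes, 1));
    size_t offset = 0;
    for (auto& group : managed) {
      for (ToyStorage* s : group.second) {
        s->data = buffer.get() + offset; // storages in a group share one slice
      }
      offset += group.first;
    }
  }

  void deallocate() {
    managed_bytes = 0;
    for (auto& group : managed) {
      size_t max_seen = 0;
      for (ToyStorage* s : group.second) {
        max_seen = std::max(max_seen, s->nbytes);
        s->data = nullptr; // drop the slice; keep the storage object alive
      }
      group.first = max_seen; // size hint for the next allocate()
      managed_bytes += max_seen;
    }
  }
};

int main() {
  ToyStorage a, b;
  ToyPlanner planner;
  planner.managed = {{0, {&a}}, {0, {&b}}};

  planner.allocate();   // first run: no size information recorded yet
  a.nbytes = 256;       // pretend the out-variant ops used this much storage
  b.nbytes = 64;
  planner.deallocate(); // records 256 and 64 for the next run

  planner.allocate();   // second run reuses a single 320-byte buffer
  std::cout << "planned bytes: " << planner.managed_bytes << "\n"; // 320
}
```

As in the real planner, the values that `assign_storage_to_managed_tensors` groups together end up sharing one slice of the buffer, and the per-group size recorded at `deallocate()` time becomes the allocation plan for the following run.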