[Static runtime] refactor MemoryPlanner codes to prepare for output tensor memory planning (pytorch#55809)

Summary:
Pull Request resolved: pytorch#55809

[Static runtime] refactor MemoryPlanner codes to prepare for output tensor memory planning

Test Plan: buck test mode/dev //caffe2/caffe2/fb/predictor:pytorch_predictor_test -- --exact 'caffe2/caffe2/fb/predictor:pytorch_predictor_test - PyTorchPredictor.StaticRuntime'

Reviewed By: bwasti

Differential Revision: D27411416

fbshipit-source-id: 7dae7c2586ce3b4ebacf6169017140166c30e99c
Peng Wu authored and facebook-github-bot committed Apr 13, 2021
1 parent 6269efd commit 18662d4
Showing 4 changed files with 104 additions and 59 deletions.
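The bulk of the diff below lifts the storage-grouping loop out of the MemoryPlanner constructor into a standalone assign_storage_to_managed_tensors helper, so the same logic can later be reused for output tensors (the commented-out output_* members added to impl.h mark that follow-up). As orientation, here is a minimal, self-contained sketch of that grouping step; Value and StorageImpl below are toy stand-ins for torch::jit::Value and c10::StorageImpl, and the managed_tensors input is a hypothetical flattening of the managed tensor outputs, not part of the real API.

// Sketch only: how values that may share storage end up in one managed slot.
#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

struct Value {};        // stand-in for torch::jit::Value
struct StorageImpl {};  // stand-in for c10::StorageImpl

// Each entry: (bytes to allocate, StorageImpls backed by that same slice).
using ManagedStorage = std::vector<std::pair<size_t, std::vector<StorageImpl*>>>;

void assign_storage(
    const std::vector<std::pair<const Value*, StorageImpl*>>& managed_tensors,
    const std::unordered_map<const Value*, std::vector<const Value*>>&
        value_to_same_storage_values,
    ManagedStorage& managed_storage) {
  std::unordered_map<const Value*, size_t> value_to_storage_idx;
  for (const auto& entry : managed_tensors) {
    const Value* val = entry.first;
    StorageImpl* impl = entry.second;
    auto it = value_to_storage_idx.find(val);
    if (it != value_to_storage_idx.end()) {
      // This value was pre-assigned to an existing group by an earlier member.
      managed_storage[it->second].second.push_back(impl);
    } else {
      // First member of a new group: open a slot, then pre-assign the whole
      // same-storage group to that slot's index.
      managed_storage.emplace_back(0, std::vector<StorageImpl*>{impl});
      const size_t idx = managed_storage.size() - 1;
      auto same = value_to_same_storage_values.find(val);
      if (same != value_to_same_storage_values.end()) {
        for (const Value* v : same->second) {
          value_to_storage_idx[v] = idx;
        }
      }
    }
  }
}

Every member of a value_to_same_storage_values group is pre-assigned the index opened by the group's first member, which is how several tensors end up backed by one slice of the planner's arena.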
143 changes: 89 additions & 54 deletions torch/csrc/jit/runtime/static/impl.cpp
@@ -969,7 +969,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
// check for intermediates
if (!ival->isNone()) {
TORCH_CHECK(
ival->isTensor() || canOptimizeConstruct(pnode.node()),
ival->isTensor() || isOptimizableContainerType(pnode.node()),
error_msg);
if (ival->isTensor()) {
const auto& t = ival->toTensor();
@@ -989,55 +989,14 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
}
}

MemoryPlanner::MemoryPlanner(
StaticRuntime* runtime,
static void assign_storage_to_managed_tensors(
const StaticRuntime* runtime,
const std::unordered_set<const Value*>& managed_tensor_values,
const std::unordered_map<const Value*, std::vector<const Value*>>&
value_to_same_storage_values,
const std::unordered_set<const Value*>& external_values,
bool out_variants) {
// collect register indices of outputs of ops with out variant
std::unordered_set<const Value*> managed_values;
std::unordered_set<IValue*> unmanaged_ivalues;
for (ProcessedNode& pnode : runtime->nodes()) {
if (canReuseInputsOutputs(pnode.node())) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
// Types are stored in the underlying TorchScript IR
const Value* out_v = pnode.node()->outputs()[i];
IValue& out = pnode.Output(i);
const auto& type = out_v->type();
if (out_variants && !external_values.count(out_v)) {
if (type->cast<TensorType>()) {
managed_values.insert(out_v);
} else if (canOptimizeConstruct(pnode.node())) {
// We "leak" containers of this type
} else {
unmanaged_ivalues.insert(&out);
}
} else {
unmanaged_ivalues.insert(&out);
}
}
} else {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
unmanaged_ivalues.insert(&pnode.Output(i));
}
}
}

// remove model outputs from managed_values and unmanaged_ivalues
for (const Value* output : runtime->graph().outputs()) {
managed_values.erase(output);
}
for (IValue* output : runtime->outputs()) {
unmanaged_ivalues.erase(output);
}

// unmanaged_ivalues => unmanaged_ivalues_
for (IValue* out : unmanaged_ivalues) {
unmanaged_ivalues_.emplace_back(out);
}

// map Value to index to managed_storage_, where multiple values can
std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>&
managed_storage) {
// map Value to index to managed_storage, where multiple values can
// map to the same index (i.e., sharing the same storage)
std::unordered_map<const Value*, size_t> value_to_storage_idx;
// the StorageImpls of Tensor views should not be managed
@@ -1048,7 +1007,7 @@ MemoryPlanner::MemoryPlanner(
for (auto i = 0; i < pnode.outputs().size(); ++i) {
const auto& ival = pnode.outputs()[i];
const auto* val = pnode.node()->outputs()[i];
if (managed_values.count(val)) {
if (managed_tensor_values.count(val)) {
TORCH_CHECK(ival.isTensor());
auto* impl = ival.toTensor().storage().unsafeGetStorageImpl();

@@ -1058,17 +1017,18 @@ MemoryPlanner::MemoryPlanner(
}

if (value_to_storage_idx.count(val)) {
managed_storage_[value_to_storage_idx.at(val)].second.emplace_back(
managed_storage[value_to_storage_idx.at(val)].second.emplace_back(
impl);
} else {
auto p =
std::make_pair<size_t, std::vector<c10::StorageImpl*>>(0, {impl});
managed_storage_.emplace_back(std::move(p));
managed_storage.emplace_back(std::move(p));
// first of a group, update the value_to_storage_idx map with the
// index
if (value_to_same_storage_values.count(val)) {
auto storage_idx = managed_storage.size() - 1;
for (const auto* v : value_to_same_storage_values.at(val)) {
value_to_storage_idx[v] = managed_storage_.size() - 1;
value_to_storage_idx[v] = storage_idx;
}
}
}
@@ -1077,6 +1037,73 @@ MemoryPlanner::MemoryPlanner(
}
}

MemoryPlanner::MemoryPlanner(
StaticRuntime* runtime,
const std::unordered_map<const Value*, std::vector<const Value*>>&
value_to_same_storage_values,
const std::unordered_set<const Value*>& external_values,
bool out_variants) {
// collect register indices of outputs of ops with out variant
std::unordered_set<const Value*> managed_tensor_values;
std::unordered_set<const Value*> leaked_values;
if (out_variants) {
for (ProcessedNode& pnode : runtime->nodes()) {
if (canReuseInputsOutputs(pnode.node())) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
const Value* out_v = pnode.node()->outputs()[i];
if (external_values.count(out_v)) {
continue;
}
// Types are stored in the underlying TorchScript IR
const auto& type = out_v->type();
if (type->cast<TensorType>()) {
managed_tensor_values.insert(out_v);
} else if (isOptimizableContainerType(pnode.node())) {
// We "leak" certain container types because their allocations take
// a long time
leaked_values.insert(out_v);
}
}
}
}
}

// collect unmanaged output ivalues
std::unordered_set<IValue*> unmanaged_ivalues;
for (ProcessedNode& pnode : runtime->nodes()) {
for (auto i = 0; i < pnode.outputs().size(); ++i) {
// Types are stored in the underlying TorchScript IR
const Value* out_v = pnode.node()->outputs()[i];
if (managed_tensor_values.count(out_v) || leaked_values.count(out_v)) {
continue;
}
IValue& out = pnode.Output(i);
unmanaged_ivalues.insert(&out);
}
}
// since runtime->outputs() escape from run(), remove them from
// managed_tensor_values and from unmanaged_ivalues
for (const Value* output : runtime->graph().outputs()) {
managed_tensor_values.erase(output);
}
for (IValue* output : runtime->outputs()) {
unmanaged_ivalues.erase(output);
}

// copy to unmanaged_ivalues_
for (IValue* out : unmanaged_ivalues) {
unmanaged_ivalues_.emplace_back(out);
}

if (out_variants) {
::torch::jit::assign_storage_to_managed_tensors(
runtime,
managed_tensor_values,
value_to_same_storage_values,
managed_tensor_storage_);
}
}

// Don't change the size if it is already aligned, otherwise increase the size
// to make it aligned.
size_t MemoryPlanner::compute_aligned_tensor_size(size_t nbytes) {
@@ -1099,7 +1126,7 @@ void MemoryPlanner::allocate() {
uint8_t* start = static_cast<uint8_t*>(buffer_.get());

reused_tensors_ = 0;
for (const auto& ms : managed_storage_) {
for (const auto& ms : managed_tensor_storage_) {
auto tensor_size = ms.first;
if (tensor_size == 0) {
continue;
@@ -1125,17 +1152,25 @@ void MemoryPlanner::deallocate() {

// free memory used by outputs of ops in out variants
// but keep the TensorImpl and StorageImpl around
for (auto& ms : managed_storage_) {
for (auto& ms : managed_tensor_storage_) {
const auto& impls = ms.second;
size_t max = 0;
for (auto& impl : impls) {
size_t current_size = compute_aligned_tensor_size(impl->nbytes());
impl->reset();
max = std::max(max, current_size);
}
// Static runtime does not know the size of tensors statically, so we use
// the tensor size from the previous run to allocate tensors for the next
// run (following C2 tradition), exploiting the fact that tensor storage
// size does not have to match that of real tensor size. The following logic
// records the tensor storage size for the next run.
ms.first = max;
managed_bytes_ += max;
}

// for unmanaged ivalues (either tensor or non-tensor), we reset the *iv so
// that the objects pointed to by *iv may be reclaimed by reference counting
for (auto& iv : unmanaged_ivalues_) {
*iv = IValue();
}
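The deallocate()/allocate() hunks above keep the C2-style trick described in the new comment: when a run ends, each storage group's peak aligned size is recorded in place of statically known shapes, and the next allocate() carves a single arena from those recorded sizes. A self-contained sketch of that cycle follows; Storage is a toy stand-in for c10::StorageImpl, the unique_ptr buffer replaces at::DataPtr, and the 64-byte alignment is an assumption (the real constant is defined in impl.cpp, outside this diff).

// Sketch only: simplified record-then-reserve arena cycle.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct Storage {
  void* data = nullptr;
  size_t nbytes = 0;
  void reset() { data = nullptr; }  // drop the payload, keep the object alive
};

constexpr size_t kAlignment = 64;  // assumed for illustration

// Round nbytes up to a multiple of kAlignment (no-op if already aligned).
size_t compute_aligned_tensor_size(size_t nbytes) {
  return (nbytes + kAlignment - 1) & ~(kAlignment - 1);
}

struct Planner {
  // (bytes reserved for the group, storages sharing that reservation)
  std::vector<std::pair<size_t, std::vector<Storage*>>> managed;
  std::unique_ptr<uint8_t[]> buffer;

  // After a run: free payloads but remember each group's peak aligned size,
  // so the next allocate() can reserve enough without static shape knowledge.
  void deallocate() {
    for (auto& ms : managed) {
      size_t max = 0;
      for (Storage* s : ms.second) {
        max = std::max(max, compute_aligned_tensor_size(s->nbytes));
        s->reset();
      }
      ms.first = max;
    }
  }

  // Before the next run: one arena sized from the recorded peaks, with each
  // group of storages pointed at its own slice.
  void allocate() {
    size_t total = 0;
    for (const auto& ms : managed) total += ms.first;
    buffer = std::make_unique<uint8_t[]>(total);
    size_t offset = 0;
    for (auto& ms : managed) {
      if (ms.first == 0) continue;
      for (Storage* s : ms.second) s->data = buffer.get() + offset;
      offset += ms.first;
    }
  }
};

The point the new comment makes is that the reserved size does not have to match a tensor's current size; it only has to be remembered so the next run can reserve at least as much up front.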
12 changes: 11 additions & 1 deletion torch/csrc/jit/runtime/static/impl.h
@@ -295,16 +295,26 @@ class MemoryPlanner {
}

private:
// ivalues created in one run but not managed by MemoryPlanner
std::vector<IValue*> unmanaged_ivalues_;

// each pair contains the size (in bytes) of data to be allocated
// and a vector of StorageImpl's that should be backed by that same data
// Thus, if memonger is disabled, all vectors are of size 1.
std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
managed_storage_;
managed_tensor_storage_;
size_t managed_bytes_{0};
size_t reused_tensors_{0};
at::DataPtr buffer_; // allocated each time we call Run()

// since output tensors are alive after one inference, their storage
// is managed differently (e.g., deallocation happens at client side)
// std::vector<std::pair<size_t, std::vector<c10::StorageImpl*>>>
// managed_output_storage_;
// size_t managed_output_bytes_{0};
// size_t reused_output_tensors_{0};
// at::DataPtr output_buffer_; // allocated each time we call Run()

static size_t compute_aligned_tensor_size(size_t nbytes);
static at::DataPtr allocate_buffer(size_t size);
};
6 changes: 3 additions & 3 deletions torch/csrc/jit/runtime/static/ops.cpp
@@ -159,7 +159,7 @@ bool inputsCanRunOutOfPlace(Node* n) {
return true;
}

bool canOptimizeConstruct(Node* n) {
bool isOptimizableContainerType(Node* n) {
const auto& type = n->output()->type();
if (type->kind() == TypeKind::ListType) {
const auto& list_type = type->expectRef<ListType>();
@@ -184,7 +184,7 @@ REGISTER_OPERATOR_FUNCTOR(
prim_ListConstruct,
[](Node* n) -> SROperator {
const auto& type = n->output()->type()->expectRef<ListType>();
bool can_optimize = canOptimizeConstruct(n);
bool can_optimize = isOptimizableContainerType(n);
return [can_optimize, &type](ProcessedNode* p_node) {
const auto& out_l = p_node->Output(0);
if (!out_l.isNone() && can_optimize) {
@@ -204,7 +204,7 @@ REGISTER_OPERATOR_FUNCTOR(
prim::TupleConstruct,
prim_TupleConstruct,
[](Node* n) -> SROperator {
bool can_optimize = canOptimizeConstruct(n);
bool can_optimize = isOptimizableContainerType(n);
return [can_optimize](ProcessedNode* p_node) {
const auto& out_l = p_node->Output(0);
if (!out_l.isNone() && can_optimize) {
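The ops.cpp hunks rename canOptimizeConstruct to isOptimizableContainerType; the ListConstruct and TupleConstruct functors capture its result as can_optimize and combine it with !out_l.isNone() to skip rebuilding an output container that already exists (the body of that branch is elided in this view). Only the start of the predicate is visible here, so the following is a hedged sketch of what such a check might look like, using a simplified type model and an assumed "all contained elements are tensors" rule rather than the real TorchScript TypeKind/ListType logic.

// Sketch only: a toy predicate in the spirit of isOptimizableContainerType.
// The exact qualifying rule is not fully visible in this diff, so the
// tensor-elements-only check below is an assumption.
#include <memory>
#include <vector>

enum class Kind { Tensor, List, Tuple, Other };

struct Type {
  Kind kind;
  std::vector<std::shared_ptr<Type>> contained;  // element types for List/Tuple
};

bool isOptimizableContainerType(const Type& output_type) {
  if (output_type.kind != Kind::List && output_type.kind != Kind::Tuple) {
    return false;
  }
  // Assumed rule: the container may be reused across runs only if every
  // contained element is a tensor whose storage the planner already manages.
  for (const auto& elem : output_type.contained) {
    if (!elem || elem->kind != Kind::Tensor) {
      return false;
    }
  }
  return true;
}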
2 changes: 1 addition & 1 deletion torch/csrc/jit/runtime/static/ops.h
@@ -87,7 +87,7 @@ bool opIsRegistered(const c10::Symbol& op_name);

bool canRunOutOfPlace(Node* n);
bool canReuseInputsOutputs(Node* n);
bool canOptimizeConstruct(Node* n);
bool isOptimizableContainerType(Node* n);

std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n);

