Add autograd automatic anomaly detection (pytorch#7677)
* add autograd automatic anomaly detection

* python 3 string support

* Fix non python build

* fix typo in doc

* better test and naming fix

* fix no python build and python object handling

* fix missing checks

* clean NO_PYTHON build

* Remove unwanted changes
albanD authored and ezyang committed Jun 12, 2018
1 parent 38362fa commit 78e3259
Showing 20 changed files with 344 additions and 3 deletions.
7 changes: 7 additions & 0 deletions docs/source/autograd.rst
@@ -98,3 +98,10 @@ and nvprof based (registers both CPU and GPU activity) using
:members:

.. autofunction:: torch.autograd.profiler.load_nvprof

Anomaly detection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: detect_anomaly

.. autoclass:: set_detect_anomaly
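
A minimal usage sketch of the context manager documented above (not part of this commit; the tensor values and the ``sqrt`` call are only illustrative):

>>> import torch
>>> from torch import autograd
>>> x = torch.tensor([0.5, 2.0], requires_grad=True)
>>> with autograd.detect_anomaly():
...     # sqrt of the negative entry yields a nan gradient in backward
...     out = torch.sqrt(x - 1.0).sum()
...     out.backward()  # raises RuntimeError naming the backward function that
...                     # returned nan values, and prints the forward traceback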
2 changes: 2 additions & 0 deletions setup.py
@@ -737,6 +737,8 @@ def run(self):
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/aten_variable_hooks.cpp",
"torch/csrc/autograd/grad_mode.cpp",
"torch/csrc/autograd/anomaly_mode.cpp",
"torch/csrc/autograd/python_anomaly_mode.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
37 changes: 36 additions & 1 deletion test/test_autograd.py
@@ -15,7 +15,7 @@
from torch.autograd.profiler import profile
from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \
suppress_warnings
from torch.autograd import Variable, Function
from torch.autograd import Variable, Function, detect_anomaly
from torch.autograd.function import InplaceFunction
from torch.testing import make_non_contiguous, randn_like

@@ -2306,6 +2306,41 @@ def test_rnn_backward_to_input_but_not_parameters_cuda(self):
out.sum().backward()
self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0)

def test_anomaly_detect_nan(self):
size = 10

class MyFunc(Function):
@staticmethod
def forward(ctx, inp1, inp2, fail_0th):
ctx.fail_0th = fail_0th
return inp1.sum(0, keepdim=True)

@staticmethod
def backward(ctx, gO):
gI = gO.clone().expand(size)
gI[0] = 0
gI[0] /= 0 # Generate a nan
if ctx.fail_0th:
return gI, None, None
else:
return None, gI, None

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
out.backward() # Should not fail

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 0th output."):
with detect_anomaly():
out.backward()

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, False)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 1th output."):
with detect_anomaly():
out.backward()


def index_variable(shape, max_indices):
if not isinstance(shape, tuple):
1 change: 1 addition & 0 deletions tools/cpp_build/libtorch/CMakeLists.txt
@@ -201,6 +201,7 @@ set(TORCH_SRCS
${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/function.cpp
${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp
1 change: 1 addition & 0 deletions torch/autograd/__init__.py
@@ -11,6 +11,7 @@
from .function import Function, NestedIOFunction
from .gradcheck import gradcheck, gradgradcheck
from .grad_mode import no_grad, enable_grad, set_grad_enabled
from .anomaly_mode import detect_anomaly, set_detect_anomaly
from . import profiler

__all__ = ['Variable', 'Function', 'backward', 'grad_mode']
99 changes: 99 additions & 0 deletions torch/autograd/anomaly_mode.py
@@ -0,0 +1,99 @@
import torch


class detect_anomaly(object):
r"""Context-manager that enable anomaly detection for the autograd engine.
This does two things:
- Running the forward pass with detection enabled will allow the backward
pass to print the traceback of the forward operation that created the failing
backward function.
- Any backward computation that generate "nan" value will raise an error.
Example:
>>> import torch
>>> from torch import autograd
>>> class MyFunc(autograd.Function):
... @staticmethod
... def forward(ctx, inp):
... return inp.clone()
... @staticmethod
... def backward(ctx, gO):
... # Error during the backward pass
... raise RuntimeError("Some error in backward")
... return gO.clone()
>>> def run_fn(a):
... out = MyFunc.apply(a)
... return out.sum()
>>> inp = torch.rand(10, 10, requires_grad=True)
>>> out = run_fn(inp)
>>> out.backward()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward
>>> with autograd.detect_anomaly():
... inp = torch.rand(10, 10, requires_grad=True)
... out = run_fn(inp)
... out.backward()
Traceback of forward call that caused the error:
File "tmp.py", line 53, in <module>
out = run_fn(inp)
File "tmp.py", line 44, in run_fn
out = MyFunc.apply(a)
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward
"""

def __init__(self):
self.prev = torch.is_anomaly_enabled()

def __enter__(self):
torch.set_anomaly_enabled(True)

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False


class set_detect_anomaly(object):
r"""Context-manager that sets the anomaly detection for the autograd engine on or off.
``set_detect_anomaly`` will enable or disable the autograd anomaly detection
based on its argument :attr:`mode`.
It can be used as a context-manager or as a function.
See ``detect_anomaly`` above for details of the anomaly detection behaviour.
Arguments:
mode (bool): Flag whether to enable anomaly detection (``True``),
or disable (``False``).
"""

def __init__(self, mode):
self.prev = torch.is_anomaly_enabled()
torch.set_anomaly_enabled(mode)

def __enter__(self):
pass

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False
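
Because ``set_detect_anomaly`` flips the global flag in its constructor, it can be used either as a plain function call or as a context manager. A small sketch (illustrative, not part of the diff):

>>> import torch
>>> prev = torch.autograd.set_detect_anomaly(True)   # function form: enables globally
>>> torch.is_anomaly_enabled()
True
>>> prev = torch.autograd.set_detect_anomaly(False)  # disables again
>>> with torch.autograd.set_detect_anomaly(True):
...     pass  # enabled only inside the block
>>> torch.is_anomaly_enabled()  # previous state restored on exit
False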
7 changes: 7 additions & 0 deletions torch/csrc/autograd/anomaly_mode.cpp
@@ -0,0 +1,7 @@
#include "torch/csrc/autograd/anomaly_mode.h"

namespace torch { namespace autograd {

bool AnomalyMode::_enabled = 0;

}}
23 changes: 23 additions & 0 deletions torch/csrc/autograd/anomaly_mode.h
@@ -0,0 +1,23 @@
#pragma once

namespace torch { namespace autograd {

struct AnomalyMode {
static bool is_enabled() {
return _enabled;
}
static void set_enabled(bool enabled) {
_enabled = enabled;
}

private:
static bool _enabled;
};


struct AnomalyMetadata {
virtual void store_stack() = 0;
virtual void print_stack() = 0;
};

}}
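
The Python-side implementation of this interface (python_anomaly_mode.cpp, added by this commit but not reproduced on this page) captures the forward call stack and replays it when backward fails. Conceptually it behaves roughly like the Python sketch below (an illustration of the contract only, not the actual implementation):

import traceback

class ForwardStackMetadata:
    """Hypothetical stand-in for AnomalyMetadata, for illustration only."""

    def __init__(self):
        self.stack = None

    def store_stack(self):
        # Called when a Function is constructed while anomaly mode is enabled:
        # remember where the forward operation came from.
        self.stack = traceback.format_stack()

    def print_stack(self):
        # Called by the engine when an exception escapes a backward function.
        if self.stack is not None:
            print("Traceback of forward call that caused the error:")
            print("".join(self.stack))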
18 changes: 18 additions & 0 deletions torch/csrc/autograd/engine.cpp
@@ -3,6 +3,7 @@
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/functions/basic_ops.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/auto_gpu.h"

@@ -269,6 +270,9 @@ auto Engine::thread_main(GraphTask *graph_task) -> void {
auto Engine::thread_on_exception(FunctionTask& task, std::exception& e) -> void {
std::lock_guard<std::mutex> lock(task.base->mutex);
if (!task.base->has_error.load()) {
if (AnomalyMode::is_enabled()) {
task.fn->metadata()->print_stack();
}
task.base->exception = std::current_exception();
task.base->has_error = true;
}
@@ -373,6 +377,20 @@ auto Engine::evaluate_function(FunctionTask& task) -> void {

int num_outputs = outputs.size();
if (num_outputs == 0) return; // Don't even acquire the mutex

if (AnomalyMode::is_enabled()) {
AutoGradMode grad_mode(false);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
AutoGPU guard(output);
if (output.defined() && output.ne(output).any().toCByte()) {
std::stringstream ss;
ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output.";
throw std::runtime_error(ss.str());
}
}
}

std::lock_guard<std::mutex> lock(task.base->mutex);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
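The check above relies on nan being the only value that compares unequal to itself, which is what ``output.ne(output).any()`` tests. A quick Python illustration of the same idea (not part of the commit):

>>> import torch
>>> t = torch.tensor([1.0, float('nan'), 3.0])
>>> mask = t.ne(t)    # only the nan entry is "not equal to itself"
>>> bool(mask.any())  # analogous to output.ne(output).any() in the C++ check
True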
4 changes: 4 additions & 0 deletions torch/csrc/autograd/engine.h
@@ -5,6 +5,7 @@

#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/input_buffer.h"
#include "torch/csrc/autograd/anomaly_mode.h"

#include <deque>
#include <exception>
@@ -41,6 +42,9 @@ struct Engine {
bool keep_graph,
bool create_graph,
const edge_list& outputs = {});
virtual std::unique_ptr<AnomalyMetadata> make_anomaly_metadata() {
return nullptr;
}

void queue_callback(std::function<void()> callback);

8 changes: 8 additions & 0 deletions torch/csrc/autograd/function.cpp
@@ -1,5 +1,6 @@
#include "torch/csrc/autograd/function.h"

#include "torch/csrc/autograd/engine.h"
#include "torch/csrc/autograd/functions/special.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/jit/ir.h"
@@ -99,6 +100,13 @@ void Function::set_up_context_edge(
backward_eval->forward_ctx_select = ctx_select;
}

AnomalyMetadata* Function::metadata() noexcept {
if (!anomaly_metadata_) {
anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
}
return anomaly_metadata_.get();
}

/*
* Fix for #5534: prevent stack overflow on deletion of deep computation graph
*
12 changes: 11 additions & 1 deletion torch/csrc/autograd/function.h
@@ -3,6 +3,7 @@
#include "torch/csrc/assertions.h"
#include "torch/csrc/autograd/edge.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/profiler.h"
#include "torch/csrc/autograd/saved_variable.h"
#include "torch/csrc/autograd/type_and_shape.h"
@@ -95,7 +96,11 @@
uint64_t sequence_nr,
edge_list&& next_edges = edge_list())
: sequence_nr_(sequence_nr),
next_edges_(std::move(next_edges)) {}
next_edges_(std::move(next_edges)) {
if (AnomalyMode::is_enabled()) {
metadata()->store_stack();
}
}

explicit Function(
edge_list&& next_edges = edge_list())
@@ -236,6 +241,10 @@
pyobj_ = pyobj;
}

/// Returns the anomaly metadata stored for this `Function`.
/// If none exist, creates a new empty one.
AnomalyMetadata* metadata() noexcept;

/// Create a context edge for the JIT.
static void set_up_context_edge(
jit::Node* this_node,
@@ -329,6 +338,7 @@

edge_list next_edges_;
PyObject* pyobj_ = nullptr; // weak reference
std::unique_ptr<AnomalyMetadata> anomaly_metadata_ = nullptr;
std::vector<std::unique_ptr<FunctionPreHook>> pre_hooks_;
std::vector<std::unique_ptr<FunctionPostHook>> post_hooks_;
auto_unique_ptr<jit::tracer::FunctionTracingState> tracing_state_;
22 changes: 22 additions & 0 deletions torch/csrc/autograd/init.cpp
@@ -77,10 +77,32 @@ static PyObject * is_grad_enabled(PyObject* _unused, PyObject *arg) {
END_HANDLE_TH_ERRORS
}

static PyObject * set_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (!PyBool_Check(arg)) {
throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
}
AnomalyMode::set_enabled(arg == Py_True);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

static PyObject * is_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (AnomalyMode::is_enabled()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
END_HANDLE_TH_ERRORS
}

// autograd methods on torch._C
static PyMethodDef methods[] = {
{"set_grad_enabled", (PyCFunction)set_grad_enabled, METH_O, nullptr},
{"is_grad_enabled", (PyCFunction)is_grad_enabled, METH_NOARGS, nullptr},
{"set_anomaly_enabled", (PyCFunction)set_anomaly_mode_enabled, METH_O, nullptr},
{"is_anomaly_enabled", (PyCFunction)is_anomaly_mode_enabled, METH_NOARGS, nullptr},
{nullptr, nullptr, 0, nullptr}
};
