Add autograd automatic anomaly detection (pytorch#7677)
* add autograd automatic anomaly detection

* python 3 string support

* Fix non python build

* fix typo in doc

* better test and naming fix

* fix no python build and python object handling

* fix missing checks

* clean NO_PYTHON build

* Remove unwanted changes
albanD authored and ezyang committed Jun 12, 2018
1 parent 38362fa commit 78e3259
Showing 20 changed files with 344 additions and 3 deletions.
7 changes: 7 additions & 0 deletions docs/source/autograd.rst
@@ -98,3 +98,10 @@ and nvprof based (registers both CPU and GPU activity) using
:members:

.. autofunction:: torch.autograd.profiler.load_nvprof

Anomaly detection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: detect_anomaly

.. autoclass:: set_detect_anomaly
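
A minimal usage sketch of the context manager documented above (not part of this commit; the tensor values and the ``sqrt`` call are only illustrative):

>>> import torch
>>> from torch import autograd
>>> x = torch.tensor([0.5, 2.0], requires_grad=True)
>>> with autograd.detect_anomaly():
...     # sqrt of the negative entry yields a nan gradient in backward
...     out = torch.sqrt(x - 1.0).sum()
...     out.backward()  # raises RuntimeError naming the backward function that
...                     # returned nan values, and prints the forward traceback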
2 changes: 2 additions & 0 deletions setup.py
@@ -737,6 +737,8 @@ def run(self):
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/aten_variable_hooks.cpp",
"torch/csrc/autograd/grad_mode.cpp",
"torch/csrc/autograd/anomaly_mode.cpp",
"torch/csrc/autograd/python_anomaly_mode.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
37 changes: 36 additions & 1 deletion test/test_autograd.py
@@ -15,7 +15,7 @@
from torch.autograd.profiler import profile
from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \
suppress_warnings
from torch.autograd import Variable, Function
from torch.autograd import Variable, Function, detect_anomaly
from torch.autograd.function import InplaceFunction
from torch.testing import make_non_contiguous, randn_like

@@ -2306,6 +2306,41 @@ def test_rnn_backward_to_input_but_not_parameters_cuda(self):
out.sum().backward()
self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0)

def test_anomaly_detect_nan(self):
size = 10

class MyFunc(Function):
@staticmethod
def forward(ctx, inp1, inp2, fail_0th):
ctx.fail_0th = fail_0th
return inp1.sum(0, keepdim=True)

@staticmethod
def backward(ctx, gO):
gI = gO.clone().expand(size)
gI[0] = 0
gI[0] /= 0 # Generate a nan
if ctx.fail_0th:
return gI, None, None
else:
return None, gI, None

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
out.backward() # Should not fail

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 0th output."):
with detect_anomaly():
out.backward()

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, False)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 1th output."):
with detect_anomaly():
out.backward()


def index_variable(shape, max_indices):
if not isinstance(shape, tuple):
1 change: 1 addition & 0 deletions tools/cpp_build/libtorch/CMakeLists.txt
@@ -201,6 +201,7 @@ set(TORCH_SRCS
${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/function.cpp
${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp
1 change: 1 addition & 0 deletions torch/autograd/__init__.py
@@ -11,6 +11,7 @@
from .function import Function, NestedIOFunction
from .gradcheck import gradcheck, gradgradcheck
from .grad_mode import no_grad, enable_grad, set_grad_enabled
from .anomaly_mode import detect_anomaly, set_detect_anomaly
from . import profiler

__all__ = ['Variable', 'Function', 'backward', 'grad_mode']
99 changes: 99 additions & 0 deletions torch/autograd/anomaly_mode.py
@@ -0,0 +1,99 @@
import torch


class detect_anomaly(object):
r"""Context-manager that enable anomaly detection for the autograd engine.
This does two things:
- Running the forward pass with detection enabled will allow the backward
pass to print the traceback of the forward operation that created the failing
backward function.
- Any backward computation that generate "nan" value will raise an error.
Example:
>>> import torch
>>> from torch import autograd
>>> class MyFunc(autograd.Function):
... @staticmethod
... def forward(ctx, inp):
... return inp.clone()
... @staticmethod
... def backward(ctx, gO):
... # Error during the backward pass
... raise RuntimeError("Some error in backward")
... return gO.clone()
>>> def run_fn(a):
... out = MyFunc.apply(a)
... return out.sum()
>>> inp = torch.rand(10, 10, requires_grad=True)
>>> out = run_fn(inp)
>>> out.backward()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward
>>> with autograd.detect_anomaly():
... inp = torch.rand(10, 10, requires_grad=True)
... out = run_fn(inp)
... out.backward()
Traceback of forward call that caused the error:
File "tmp.py", line 53, in <module>
out = run_fn(inp)
File "tmp.py", line 44, in run_fn
out = MyFunc.apply(a)
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward
"""

def __init__(self):
self.prev = torch.is_anomaly_enabled()

def __enter__(self):
torch.set_anomaly_enabled(True)

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False


class set_detect_anomaly(object):
r"""Context-manager that sets the anomaly detection for the autograd engine on or off.
``set_detect_anomaly`` will enable or disable the autograd anomaly detection
based on its argument :attr:`mode`.
It can be used as a context-manager or as a function.
See ``detect_anomaly`` above for details of the anomaly detection behaviour.
Arguments:
mode (bool): Flag whether to enable anomaly detection (``True``),
or disable (``False``).
"""

def __init__(self, mode):
self.prev = torch.is_anomaly_enabled()
torch.set_anomaly_enabled(mode)

def __enter__(self):
pass

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False
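
Because ``set_detect_anomaly`` flips the global flag in its constructor, it can be used either as a plain function call or as a context manager. A small sketch (illustrative, not part of the diff):

>>> import torch
>>> prev = torch.autograd.set_detect_anomaly(True)   # function form: enables globally
>>> torch.is_anomaly_enabled()
True
>>> prev = torch.autograd.set_detect_anomaly(False)  # disables again
>>> with torch.autograd.set_detect_anomaly(True):
...     pass  # enabled only inside the block
>>> torch.is_anomaly_enabled()  # previous state restored on exit
False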
7 changes: 7 additions & 0 deletions torch/csrc/autograd/anomaly_mode.cpp
@@ -0,0 +1,7 @@
#include "torch/csrc/autograd/anomaly_mode.h"

namespace torch { namespace autograd {

bool AnomalyMode::_enabled = 0;

}}
23 changes: 23 additions & 0 deletions torch/csrc/autograd/anomaly_mode.h
@@ -0,0 +1,23 @@
#pragma once

namespace torch { namespace autograd {

struct AnomalyMode {
static bool is_enabled() {
return _enabled;
}
static void set_enabled(bool enabled) {
_enabled = enabled;
}

private:
static bool _enabled;
};


struct AnomalyMetadata {
virtual void store_stack() = 0;
virtual void print_stack() = 0;
};

}}
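
The Python-side implementation of this interface (python_anomaly_mode.cpp, added by this commit but not reproduced on this page) captures the forward call stack and replays it when backward fails. Conceptually it behaves roughly like the Python sketch below (an illustration of the contract only, not the actual implementation):

import traceback

class ForwardStackMetadata:
    """Hypothetical stand-in for AnomalyMetadata, for illustration only."""

    def __init__(self):
        self.stack = None

    def store_stack(self):
        # Called when a Function is constructed while anomaly mode is enabled:
        # remember where the forward operation came from.
        self.stack = traceback.format_stack()

    def print_stack(self):
        # Called by the engine when an exception escapes a backward function.
        if self.stack is not None:
            print("Traceback of forward call that caused the error:")
            print("".join(self.stack))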
18 changes: 18 additions & 0 deletions torch/csrc/autograd/engine.cpp
@@ -3,6 +3,7 @@
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/functions/basic_ops.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/auto_gpu.h"

@@ -269,6 +270,9 @@ auto Engine::thread_main(GraphTask *graph_task) -> void {
auto Engine::thread_on_exception(FunctionTask& task, std::exception& e) -> void {
std::lock_guard<std::mutex> lock(task.base->mutex);
if (!task.base->has_error.load()) {
if (AnomalyMode::is_enabled()) {
task.fn->metadata()->print_stack();
}
task.base->exception = std::current_exception();
task.base->has_error = true;
}
@@ -373,6 +377,20 @@ auto Engine::evaluate_function(FunctionTask& task) -> void {

int num_outputs = outputs.size();
if (num_outputs == 0) return; // Don't even acquire the mutex

if (AnomalyMode::is_enabled()) {
AutoGradMode grad_mode(false);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
AutoGPU guard(output);
if (output.defined() && output.ne(output).any().toCByte()) {
std::stringstream ss;
ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output.";
throw std::runtime_error(ss.str());
}
}
}

std::lock_guard<std::mutex> lock(task.base->mutex);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
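The check above relies on nan being the only value that compares unequal to itself, which is what ``output.ne(output).any()`` tests. A quick Python illustration of the same idea (not part of the commit):

>>> import torch
>>> t = torch.tensor([1.0, float('nan'), 3.0])
>>> mask = t.ne(t)    # only the nan entry is "not equal to itself"
>>> bool(mask.any())  # analogous to output.ne(output).any() in the C++ check
True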
4 changes: 4 additions & 0 deletions torch/csrc/autograd/engine.h
@@ -5,6 +5,7 @@

#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/input_buffer.h"
#include "torch/csrc/autograd/anomaly_mode.h"

#include <deque>
#include <exception>
@@ -41,6 +42,9 @@ struct Engine {
bool keep_graph,
bool create_graph,
const edge_list& outputs = {});
virtual std::unique_ptr<AnomalyMetadata> make_anomaly_metadata() {
return nullptr;
}

void queue_callback(std::function<void()> callback);

8 changes: 8 additions & 0 deletions torch/csrc/autograd/function.cpp
@@ -1,5 +1,6 @@
#include "torch/csrc/autograd/function.h"

#include "torch/csrc/autograd/engine.h"
#include "torch/csrc/autograd/functions/special.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/jit/ir.h"
@@ -99,6 +100,13 @@ void Function::set_up_context_edge(
backward_eval->forward_ctx_select = ctx_select;
}

AnomalyMetadata* Function::metadata() noexcept {
if (!anomaly_metadata_) {
anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
}
return anomaly_metadata_.get();
}

/*
* Fix for #5534: prevent stack overflow on deletion of deep computation graph
*
12 changes: 11 additions & 1 deletion torch/csrc/autograd/function.h
@@ -3,6 +3,7 @@
#include "torch/csrc/assertions.h"
#include "torch/csrc/autograd/edge.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/profiler.h"
#include "torch/csrc/autograd/saved_variable.h"
#include "torch/csrc/autograd/type_and_shape.h"
@@ -95,7 +96,11 @@
uint64_t sequence_nr,
edge_list&& next_edges = edge_list())
: sequence_nr_(sequence_nr),
next_edges_(std::move(next_edges)) {}
next_edges_(std::move(next_edges)) {
if (AnomalyMode::is_enabled()) {
metadata()->store_stack();
}
}

explicit Function(
edge_list&& next_edges = edge_list())
@@ -236,6 +241,10 @@
pyobj_ = pyobj;
}

/// Returns the anomaly metadata stored for this `Function`.
/// If none exist, creates a new empty one.
AnomalyMetadata* metadata() noexcept;

/// Create a context edge for the JIT.
static void set_up_context_edge(
jit::Node* this_node,
@@ -329,6 +338,7 @@

edge_list next_edges_;
PyObject* pyobj_ = nullptr; // weak reference
std::unique_ptr<AnomalyMetadata> anomaly_metadata_ = nullptr;
std::vector<std::unique_ptr<FunctionPreHook>> pre_hooks_;
std::vector<std::unique_ptr<FunctionPostHook>> post_hooks_;
auto_unique_ptr<jit::tracer::FunctionTracingState> tracing_state_;
22 changes: 22 additions & 0 deletions torch/csrc/autograd/init.cpp
@@ -77,10 +77,32 @@ static PyObject * is_grad_enabled(PyObject* _unused, PyObject *arg) {
END_HANDLE_TH_ERRORS
}

static PyObject * set_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (!PyBool_Check(arg)) {
throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
}
AnomalyMode::set_enabled(arg == Py_True);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

static PyObject * is_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (AnomalyMode::is_enabled()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
END_HANDLE_TH_ERRORS
}

// autograd methods on torch._C
static PyMethodDef methods[] = {
{"set_grad_enabled", (PyCFunction)set_grad_enabled, METH_O, nullptr},
{"is_grad_enabled", (PyCFunction)is_grad_enabled, METH_NOARGS, nullptr},
{"set_anomaly_enabled", (PyCFunction)set_anomaly_mode_enabled, METH_O, nullptr},
{"is_anomaly_enabled", (PyCFunction)is_anomaly_mode_enabled, METH_NOARGS, nullptr},
{nullptr, nullptr, 0, nullptr}
};
