Merge pull request PaddlePaddle#14284 from PaddlePaddle/revert-14043-conv_cudnn_cache

Revert " Exhaustive search for cuDNN conv."
chengduoZH · Nov 7, 2018 · 0953cd3 · 0953cd3
2 parents ce7d9b0 + db8c52d
commit 0953cd3
Show file tree

Hide file tree

Showing 14 changed files with 74 additions and 381 deletions.
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>

diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
@@ -13,8 +13,6 @@
 // limitations under the License.
 
 #pragma once
-#include <algorithm>
-#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"

diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
@@ -16,14 +16,13 @@
 
 #include <glog/logging.h>
 #include <sys/time.h>
-#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle_inference_api.h"
 
 namespace paddle {
 namespace inference {

diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
@@ -59,8 +59,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
   if (var->Persistable() &&
       var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != framework::proto::VarType::RAW) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
     return true;
   }
   return false;

diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h
@@ -66,10 +66,9 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
           x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
       for (int j = 0; j < max_length; ++j) {
         for (int k = 0; k < half_size; ++k) {
-          const double val =
-              (half_size > 1)
-                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
-                  : j / 10000.0;
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
           dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
           dst_ptr[half_size + k] =
               src_ptr[half_size + k] * alpha + cos(val) * beta;

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -15,22 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "true, the algorithm is deterministic.");
-DEFINE_uint64(conv_workspace_size_limit, 4096,
-              "cuDNN convolution workspace limit in MB unit.");
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, defalut is False.");
 
 namespace paddle {
 namespace operators {
@@ -43,25 +36,13 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
-static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
-static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
-static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
-
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
-static constexpr size_t kNUM_CUDNN_FWD_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
-    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -74,8 +55,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -141,18 +120,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
-
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    bool half_float = false;
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
@@ -163,65 +143,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      half_float = true;
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
 
-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
-    if ((!exhaustive_search) && (!half_float)) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-      VLOG(3) << "cuDNN forward algo " << algo;
-    } else if (exhaustive_search && (!half_float)) {
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
-      algo = algo_cache->GetAlgorithm(
-          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-            int returned_algo_count;
-            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
-                fwd_perf_stat;
-            auto cudnn_find_func = [&](void* cudnn_workspace) {
-              CUDNN_ENFORCE(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                      filter_data, cudnn_conv_desc, cudnn_output_desc,
-                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      fwd_perf_stat.data(), cudnn_workspace,
-                      workspace_size_limit));
-            };
-            dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                              workspace_size_limit);
-
-            VLOG(3) << "Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = fwd_perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return fwd_perf_stat[0].algo;
-          });
-      VLOG(3) << "choose algo " << algo;
-    } else {
-      PADDLE_ENFORCE(half_float,
-                     "cuDNN exhaustive search doesn't support half float.");
-    }
-
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -251,7 +178,6 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
@@ -270,13 +196,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-    if (exhaustive_search && FLAGS_cudnn_deterministic) {
-      PADDLE_THROW(
-          "Cann't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -344,65 +263,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
     size_t workspace_size_in_bytes = 0, tmp_size = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
 
-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
-          data_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        } else {
-          data_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        }
-        data_algo = data_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
-                         kNUM_CUDNN_BWD_DATA_ALGS>
-                  data_perf_stat;
-              auto cudnn_find_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardDataAlgorithmEx(
-                            handle, cudnn_filter_desc, filter_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
-                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                            data_perf_stat.data(), cudnn_workspace,
-                            workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                                workspace_size_limit);
-
-              VLOG(3) << "Perf result: (algo: stat, time, memory)";
-              for (int i = 0; i < returned_algo_count; ++i) {
-                const auto& stat = data_perf_stat[i];
-                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                        << " " << stat.memory;
-              }
-              return data_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward data algo " << data_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -415,7 +283,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 cudnn_input_desc,
                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -424,54 +295,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
-          f_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        } else {
-          f_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        }
-        filter_algo = f_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
-                         kNUM_CUDNN_BWD_FILTER_ALGS>
-                  filter_perf_stat;
-              auto cudnn_find_f_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                            handle, cudnn_input_desc, input_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_filter_desc,
-                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
-                            &returned_algo_count, filter_perf_stat.data(),
-                            cudnn_workspace, workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_f_func,
-                                                workspace_size_limit);
-              return filter_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,