cherry picked windows patches. #14046

Merged: 71 commits, Nov 8, 2018
Changes from 1 commit

Commits (71):
adec702
cudnn widndows
dzhwinter Aug 17, 2018
963a745
"add comment"
dzhwinter Aug 17, 2018
335398f
dlfnh
dzhwinter Aug 17, 2018
59160e8
"windows support"
dzhwinter Aug 17, 2018
36878d7
comment out backtarce
dzhwinter Aug 17, 2018
64ce121
"windows support"
dzhwinter Aug 17, 2018
5c88cd2
remove werror in windows
dzhwinter Aug 17, 2018
f1a7ae3
"fix cmake error"
dzhwinter Aug 20, 2018
17602ea
windows port of malloc
dzhwinter Aug 20, 2018
c5722eb
merge windows/cudnn
dzhwinter Aug 20, 2018
34f8c9b
windows port
dzhwinter Aug 24, 2018
89f95ea
merge develop branch
dzhwinter Aug 24, 2018
c1ad52f
pre-commit
dzhwinter Aug 24, 2018
cfbf1ba
add source
dzhwinter Aug 24, 2018
a94d4f5
fix math_function compile
dzhwinter Aug 24, 2018
65f144a
fix commit
dzhwinter Aug 24, 2018
c7e0ed8
inference lib
dzhwinter Aug 24, 2018
488a2dd
with ir node
dzhwinter Aug 25, 2018
efd0884
add op registry
dzhwinter Aug 25, 2018
d7f98f3
more platform is done
dzhwinter Aug 25, 2018
26dbe35
add msvc flags and copy lib done
dzhwinter Aug 26, 2018
7dceb8a
check some operators
dzhwinter Aug 26, 2018
2ec589a
float.h fixed
dzhwinter Aug 26, 2018
cd8f3e9
operator module is done
dzhwinter Aug 27, 2018
78aab05
fix more op errors
dzhwinter Aug 27, 2018
b74af56
cpu compile is done
dzhwinter Aug 28, 2018
b78394e
done
dzhwinter Aug 29, 2018
f5329d6
add some synatx
dzhwinter Aug 30, 2018
5e8e7fb
change data type
dzhwinter Aug 30, 2018
dbe90cc
merge develop branch
dzhwinter Sep 1, 2018
b5402d7
Merge remote-tracking branch 'origin/develop' into windows/support
dzhwinter Sep 2, 2018
52d60f8
merge conclit
dzhwinter Sep 2, 2018
5c2637e
tensor util
dzhwinter Sep 2, 2018
75681c0
switch to 9.2
dzhwinter Sep 2, 2018
a0aa2ec
build compile
dzhwinter Sep 2, 2018
379b471
squash commit
dzhwinter Sep 3, 2018
c3e1fb5
add demo
dzhwinter Sep 12, 2018
372caf4
windows staff
dzhwinter Sep 14, 2018
e199953
debug the device context
dzhwinter Sep 14, 2018
85f8dd1
debug version
dzhwinter Sep 14, 2018
3ae9645
compile in linux
wanghaoshuang Oct 14, 2018
b12f7c2
compile in linux.
wanghaoshuang Oct 15, 2018
962061f
windows fix
dzhwinter Oct 15, 2018
804dd7d
merge conflict. both linux and windows pass.
dzhwinter Oct 15, 2018
e41a3fc
fix update to develop hang problem.
dzhwinter Oct 16, 2018
607080e
windows static library
dzhwinter Oct 23, 2018
f9e7cfb
save binary file
wanghaoshuang Oct 23, 2018
5993155
Merge remote-tracking branch 'dzhwinter/windows/support' into windows…
wanghaoshuang Oct 23, 2018
78cf76a
fix linux compile
wanghaoshuang Oct 23, 2018
9e522a4
update cmake
wanghaoshuang Oct 23, 2018
c6dcffc
lb. add debug output
dzhwinter Oct 23, 2018
dbd0075
Merge branch 'windows/support' into lb
dzhwinter Oct 23, 2018
597d921
clean demo_ci
dzhwinter Oct 24, 2018
abe8e20
clean demo_ci
dzhwinter Oct 24, 2018
b154e0b
clean demo_ci
dzhwinter Oct 24, 2018
468467f
update real incnet tester
dzhwinter Oct 24, 2018
09409ba
staged. test speed=49ms in 1080.
dzhwinter Oct 26, 2018
7141deb
add cudnn back. staged.
dzhwinter Oct 26, 2018
c8adc2c
cudnn version. staged.
dzhwinter Oct 29, 2018
ebfe5a0
merge develop branch
dzhwinter Oct 30, 2018
bf2e4cb
cleard. staged
dzhwinter Oct 30, 2018
3167658
add back jit simd instructions. stage.
dzhwinter Oct 31, 2018
9da7b33
details
dzhwinter Nov 1, 2018
1ace55c
merge develop branch
dzhwinter Nov 1, 2018
0a18058
clean cmake. test=develop
dzhwinter Nov 1, 2018
eb2f7ed
refine tests. test=develop
dzhwinter Nov 2, 2018
cc02353
test=develop
dzhwinter Nov 2, 2018
60f70b1
test=develop
dzhwinter Nov 4, 2018
deb4af7
add test
dzhwinter Nov 7, 2018
2835e04
merge develop branch. test=develop
dzhwinter Nov 7, 2018
234a1d9
Merge remote-tracking branch 'origin/develop' into windows/debug
dzhwinter Nov 8, 2018
cleard. staged
dzhwinter committed Oct 30, 2018
commit bf2e4cb1882b077b9efa78626f30965e3f15a2ab
3 changes: 2 additions & 1 deletion cmake/cuda.cmake
@@ -173,6 +173,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
# in cuda9, suppress cuda warning on eigen with "-w"
list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC")
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w")
@@ -181,7 +182,7 @@ endif(NOT WIN32)
if(WITH_FAST_MATH)
# Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
endif(WITH_FAST_MATH)

# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
1 change: 1 addition & 0 deletions cmake/external/threadpool.cmake
@@ -3,6 +3,7 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
message("Debug" ${THREADPOOL_INCLUDE_DIR})

ExternalProject_Add(
extern_threadpool
20 changes: 4 additions & 16 deletions cmake/flags.cmake
@@ -143,26 +143,14 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
set(COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer)

else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.

set(GPU_COMMON_FLAGS
"/w") #disable all warnings

endif(NOT WIN32)

else(NOT WIN32)
set(COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
"/w") #disable all warnings
endif(NOT WIN32)

79 changes: 15 additions & 64 deletions paddle/fluid/framework/executor.cc
@@ -48,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}

#ifndef _WIN32
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
@@ -82,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
gc->Add(erase_tensors);
}
}
#endif

Executor::Executor(const platform::Place& place) : place_(place) {}

@@ -331,97 +333,35 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,

std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id) {
VLOG(3) << "before create prepare" << block_id << " " << program.Size();
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
VLOG(3) << "after create prepare";
// PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
VLOG(3) << "before create op_desc";
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
VLOG(3) << "create before" << ctx->ops_.size() << " "
<< block.AllOps().size();
int counter = 0;
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
VLOG(3) << "create op "
<< "index " << ++counter << " type " << op_desc->Type();
}
VLOG(3) << "create finished" << ctx->ops_.size() << " "
<< block.AllOps().size();
return ctx;
}

std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids) {
VLOG(3) << "inside prepare";
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
VLOG(3) << "before go through block_ids";
for (auto& bid : block_ids) {
VLOG(3) << "block id" << bid;
auto* ctx = new ExecutorPrepareContext(program, bid);
// PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
int counter = 0;
VLOG(3) << "create before" << ctx->ops_.size() << " "
<< block.AllOps().size();
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
VLOG(3) << "create op "
<< "index " << ++counter << " type " << op_desc->Type();
}
VLOG(3) << "create finished" << ctx->ops_.size() << " "
<< block.AllOps().size();
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
}
return result;
}

// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
// Scope* local_scope) {
// VLOG(3) << "before checking result";
// auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
// std::vector<std::string> outputs;
// auto& block = ctx->prog_.Block(0);
// bool found = false;
// framework::OpDesc* myop = nullptr;
// for(auto& op : block.AllOps()) {
// if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
// "feed") return;
// if (op->Type() == op_type) {
// found = true;
// myop = op;
// break;
// }
// }
// }
// if(!found) {
// VLOG(3) << "not found op!";
// return;
// }
// auto* op = myop;
// VLOG(3) << "start op output" << op->Type();
// for(auto var_name: op->OutputArgumentNames()) {
// auto* var = local_scope->Var(var_name);
// auto* var_desc = block.FindVar(var_name);
// if (var_desc->Persistable()) continue;
// auto* tensor = var->GetMutable<framework::LoDTensor>();
// framework::Tensor check;
// VLOG(3) << "before tensor copy";
// framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
// VLOG(3) << "after tensor copy";
// float sum = .0;
// for(size_t i=0; i < check.numel(); ++i) {
// sum += check.data<float>()[i];
// }
// VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
// << sum;
// VLOG(3) << "after checking result";
// }

void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars,
bool keep_kids) {
VLOG(3) << "RunPreparedContext inside";
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
@@ -430,6 +370,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
}

#ifndef _WIN32
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
// WhileOp would set keep_kids to false
@@ -471,6 +412,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} else {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
#else // WIN32
for (auto& op : ctx->ops_) {
op->Run(*local_scope, place_);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
}
platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif // NOT WIN32

if (local_scope != scope) {
scope->DeleteScope(local_scope);
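Taken together, the executor changes compile the eager-deletion garbage collector out of the Windows build and fall back to a plain run loop over the prepared ops. A minimal sketch of that guard pattern follows; the types and names here are illustrative placeholders, not Paddle's actual API.

#include <memory>
#include <vector>

// Placeholder types standing in for Paddle's Scope and OperatorBase.
struct Scope {};
struct Op {
  void Run(Scope* scope) { (void)scope; /* kernel dispatch happens here */ }
};

void RunOps(std::vector<std::unique_ptr<Op>>* ops, Scope* scope) {
#ifndef _WIN32
  // Non-Windows builds can interleave eager tensor deletion with execution.
  for (auto& op : *ops) {
    op->Run(scope);
    // gc->Add(unused_tensors);  // garbage collector, compiled out on Windows
  }
#else
  // Windows build: plain run loop, no garbage collector.
  for (auto& op : *ops) {
    op->Run(scope);
  }
#endif
}

int main() {
  std::vector<std::unique_ptr<Op>> ops;
  ops.emplace_back(new Op());
  Scope scope;
  RunOps(&ops, &scope);
  return 0;
}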
4 changes: 3 additions & 1 deletion paddle/fluid/framework/executor.h
@@ -17,12 +17,14 @@ limitations under the License. */
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#ifndef _WIN32
#include "paddle/fluid/framework/garbage_collector.h"
#endif

namespace paddle {
namespace framework {
3 changes: 2 additions & 1 deletion paddle/fluid/inference/CMakeLists.txt
@@ -35,9 +35,10 @@ endif()

# Create static library
if (WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_thirdpa} paddle_fluid_api paddle_inference_api)
cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api)
else(WIND32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
endif(WIN32)

if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/CMakeLists.txt
@@ -51,6 +51,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test)

cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
1 change: 0 additions & 1 deletion paddle/fluid/inference/api/api.cc
@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"

namespace paddle {

5 changes: 2 additions & 3 deletions paddle/fluid/inference/api/api_impl.cc
@@ -260,9 +260,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0.,
1.]");
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0.,1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
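The removed PADDLE_ENFORCE_GT call had its message literal broken across a physical line, which is not valid C++; the fix joins the message onto one line. When a long message does need to span source lines, adjacent string literals concatenate at compile time, as in this standalone sketch:

#include <cstdio>

int main() {
  // Adjacent string literals are merged by the compiler, so a long message
  // can span source lines without a raw newline inside any one literal.
  const char* msg =
      "fraction_of_gpu_memory in the config should be set "
      "to range (0., 1.]";
  std::puts(msg);
  return 0;
}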
2 changes: 1 addition & 1 deletion paddle/fluid/inference/api/api_impl.h
@@ -31,10 +31,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle_inference_api.h" // NOLINT

namespace paddle {

9 changes: 4 additions & 5 deletions paddle/fluid/inference/api/helper.h
@@ -14,18 +14,17 @@

#pragma once

#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include <glog/logging.h>

#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/string/printf.h"
#include "paddle_inference_api.h"
#include "timer.h"
#include "paddle_inference_api.h" //NOLINT

namespace paddle {
namespace inference {
@@ -97,7 +96,7 @@ static void TensorAssignData(PaddleTensor *tensor,
}

template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
int size{0};
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
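helper.h now defines two macros ahead of the glog include. On Windows, windows.h defines an ERROR macro that clashes with glog's abbreviated severity names, and GOOGLE_GLOG_DLL_DECL is typically defined empty when linking glog statically; defining both before the include sidesteps each problem. A small sketch of the ordering, assuming glog is available:

// The defines must come before the include to take effect.
#define GLOG_NO_ABBREVIATED_SEVERITIES  // avoid the windows.h ERROR clash
#define GOOGLE_GLOG_DLL_DECL            // no dllimport decoration (static glog)
#include <glog/logging.h>

int main(int argc, char** argv) {
  (void)argc;
  google::InitGoogleLogging(argv[0]);
  LOG(INFO) << "glog initialized without the Windows ERROR macro conflict";
  return 0;
}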
10 changes: 4 additions & 6 deletions paddle/fluid/operators/CMakeLists.txt
@@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
@@ -31,12 +31,12 @@ namespace operators {

template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
return (a > b) || fabsf(static_cast<float>(a - b)) < 1e-4;
}

template <typename T>
__device__ bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
return (a < b) || fabsf(static_cast<float>(a - b)) < 1e-4;
}

template <typename T>
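The tolerance comparisons switch from fabs to fabsf with an explicit float cast, presumably so a single-precision overload is resolved unambiguously for any template T under MSVC and nvcc. A host-side analog of the same comparison (using std::fabs on a float, which is equivalent to fabsf):

#include <cassert>
#include <cmath>

// Host-side analog of the device-side GT_E above: "greater or approximately
// equal" within an absolute tolerance of 1e-4.
template <typename T>
bool ApproxGE(T a, T b) {
  return (a > b) || std::fabs(static_cast<float>(a - b)) < 1e-4f;
}

int main() {
  assert(ApproxGE(1.00005f, 1.0f));  // within tolerance counts as equal
  assert(ApproxGE(2.0f, 1.0f));      // strictly greater
  assert(!ApproxGE(1.0f, 2.0f));     // clearly smaller
  return 0;
}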
14 changes: 7 additions & 7 deletions paddle/fluid/operators/math/CMakeLists.txt
@@ -57,9 +57,6 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(unpooling)
math_library(vol2col)

@@ -75,7 +72,10 @@ if(WITH_GPU)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
if (NOT WIN32)
math_library(matrix_bit_code)
cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
2 changes: 2 additions & 0 deletions paddle/fluid/platform/device_context.cc
@@ -235,7 +235,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
<< ", Runtime Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;

#ifndef _WIN32
callback_manager_.reset(new StreamCallbackManager(stream_));
#endif // NOT WIN32
}

CUDADeviceContext::~CUDADeviceContext() {
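This is the same guard pattern as in the executor: StreamCallbackManager only exists in non-Windows builds, so both the member and its initialization sit behind one #ifndef. A sketch with a stand-in type rather than Paddle's real class:

#include <memory>

struct StreamCallbackManagerStub {};  // stand-in for the real class

class DeviceContextSketch {
 public:
  DeviceContextSketch() {
#ifndef _WIN32
    // Only non-Windows builds create the callback manager.
    callback_manager_.reset(new StreamCallbackManagerStub());
#endif
  }

 private:
#ifndef _WIN32
  std::unique_ptr<StreamCallbackManagerStub> callback_manager_;
#endif
};

int main() {
  DeviceContextSketch ctx;
  (void)ctx;
  return 0;
}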