From 94bc25d4bf53713fcfd9d022b308345dcdb2dc43 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Wed, 4 Jul 2018 18:48:36 +0800
Subject: [PATCH 01/62] add releasing for mac

---
 doc/fluid/dev/releasing_process_en.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index f989b964d6d1a..2c1c30c1eddfd 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You
 * pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
   old version. you must change the version number before upload a new one.
 
+### Publish wheel Packages for MacOS
+
+You need to build the binary wheel package for MacOS before publishing, to
+make sure that the package can be used by many versions of MacOS
+(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
+you must build the package ***exactly*** following below steps:
+
+Build steps:
+
+1. install python from python.org downloads, and make sure it's currently in use
+   in your system.
+1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions.
+1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
+1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF  ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
+1. `make -j`
+1. `pip install delocate`
+1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
+
+Then the whl under `fixed_wheel` is ready to upload.
+
+Install steps:
+
+1. run `pip install paddlepaddle...whl`
+1. find the `libpython.dylib` that are currently in use:
+    - for python.org package installs, do nothing.
+    - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path`
+
 ## Publish Docker Images
 
 Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:

From 6588d0e039b36be9febd51683b6cad17264628ab Mon Sep 17 00:00:00 2001
From: Michal Gallus <michal.gallus@intel.com>
Date: Mon, 13 Aug 2018 12:20:06 +0200
Subject: [PATCH 02/62] Update MKLDNN to 0.15, fix conv integration

---
 cmake/external/mkldnn.cmake              |  2 +-
 paddle/fluid/framework/tensor.cc         |  9 ++++----
 paddle/fluid/framework/tensor.h          | 14 +++++++-----
 paddle/fluid/framework/tensor_impl.h     |  9 ++++----
 paddle/fluid/operators/conv_mkldnn_op.cc | 28 +++++++++++++++++-------
 5 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 260985cc8aa4a..baf253df27556 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
+    GIT_TAG             "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 56bb9142dabe0..222a51672fc82 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -31,7 +31,8 @@ size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 
-void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           int64_t requested_size) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -39,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                     "When calling this method, the Tensor's numel must be "
                     "equal or larger than zero. "
                     "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
+  int64_t size = requested_size ? requested_size : numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
@@ -68,10 +69,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                                  offset_);
 }
 
-void* Tensor::mutable_data(platform::Place place) {
+void* Tensor::mutable_data(platform::Place place, int64_t requested_size) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), requested_size);
 }
 
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 0bbfd66148e9b..a4454c90b06f0 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -89,22 +89,24 @@ class Tensor {
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place, int64_t requested_size = 0);
 
-  void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type,
+                     int64_t requested_size = 0);
 
-  void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place, int64_t requested_size = 0);
 
   /**
    * @brief     Return a pointer to mutable memory block.
    *
-   * @param[in] dims    The dimensions of the memory block.
-   * @param[in] place   The place of the memory block.
+   * @param[in] dims           The dimensions of the memory block.
+   * @param[in] place          The place of the memory block.
+   * @param[in] requested_size The size of the block in bytes.
    *
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place, int64_t requested_size = 0);
 
   /*! Return the dimensions of the memory block. */
   const DDim& dims() const;
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index b7b62eef23ec3..ea10c9a2658cb 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -46,16 +46,17 @@ inline T* Tensor::data() {
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               int64_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, requested_size);
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, int64_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
 }
 
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index f07ab5a33b87d..77d0cf07a869d 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -53,6 +53,18 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     key_ += "-BWD";
   }
 
+  size_t GetDstMemorySize() {
+    return conv_pd_->dst_primitive_desc().get_size();
+  }
+
+  size_t GetDiffWeightsMemorySize() {
+    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
+  }
+
+  size_t GetDiffSourceMemorySize() {
+    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
       std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
@@ -251,7 +263,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
     PADDLE_ENFORCE(input->dims().size() == 4,
                    "Input must be with 4 dimensions, i.e. NCHW");
@@ -306,6 +317,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
         user_weights_md, to_void_cast<T>(filter_data));
 
+    T* output_data =
+        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
         handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -393,13 +406,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     T* input_grad_data = nullptr;
     T* filter_grad_data = nullptr;
 
-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (filter_grad) {
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
@@ -485,6 +491,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
           handler.AcquireDiffDstMemoryFromWeightsPrimitive(
               user_diff_dst_memory_p, pipeline);
 
+      size_t size = handler.GetDiffWeightsMemorySize();
+      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
+
       auto diff_weights_memory_p =
           handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
               reinterpret_cast<void*>(filter_grad_data));
@@ -507,6 +516,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
           handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
                                                         pipeline);
 
+      size_t size = handler.GetDiffSourceMemorySize();
+      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
+
       auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
           reinterpret_cast<void*>(input_grad_data));
 

From 4a7f0698e0b7169022409b0f962e7c7d24caab85 Mon Sep 17 00:00:00 2001
From: Michal Gallus <michal.gallus@intel.com>
Date: Tue, 14 Aug 2018 13:29:44 +0200
Subject: [PATCH 03/62] Add consts to new MKLDNN integration

Also replace memory types from int64_t to size_t
---
 paddle/fluid/framework/tensor.cc         |  6 +++---
 paddle/fluid/framework/tensor.h          |  8 ++++----
 paddle/fluid/framework/tensor_impl.h     |  4 ++--
 paddle/fluid/operators/conv_mkldnn_op.cc | 10 +++++-----
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 222a51672fc82..d61dbb98a235c 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -32,7 +32,7 @@ size_t Tensor::memory_size() const {
 }
 
 void* Tensor::mutable_data(platform::Place place, std::type_index type,
-                           int64_t requested_size) {
+                           size_t requested_size) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -40,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
                     "When calling this method, the Tensor's numel must be "
                     "equal or larger than zero. "
                     "Please check Tensor::Resize has been called first.");
-  int64_t size = requested_size ? requested_size : numel() * SizeOfType(type);
+  size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
@@ -69,7 +69,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
                                  offset_);
 }
 
-void* Tensor::mutable_data(platform::Place place, int64_t requested_size) {
+void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing.");
   return mutable_data(place, holder_->type(), requested_size);
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index a4454c90b06f0..4cf95fa0ae078 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -89,12 +89,12 @@ class Tensor {
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(platform::Place place, int64_t requested_size = 0);
+  T* mutable_data(platform::Place place, size_t requested_size = 0);
 
   void* mutable_data(platform::Place place, std::type_index type,
-                     int64_t requested_size = 0);
+                     size_t requested_size = 0);
 
-  void* mutable_data(platform::Place place, int64_t requested_size = 0);
+  void* mutable_data(platform::Place place, size_t requested_size = 0);
 
   /**
    * @brief     Return a pointer to mutable memory block.
@@ -106,7 +106,7 @@ class Tensor {
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, int64_t requested_size = 0);
+  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
 
   /*! Return the dimensions of the memory block. */
   const DDim& dims() const;
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index ea10c9a2658cb..6d3047c95d6cf 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -47,14 +47,14 @@ inline T* Tensor::data() {
 
 template <typename T>
 inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               int64_t requested_size) {
+                               size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
   return mutable_data<T>(place, requested_size);
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, int64_t requested_size) {
+inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
 }
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 77d0cf07a869d..d75e6412c8950 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -53,15 +53,15 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     key_ += "-BWD";
   }
 
-  size_t GetDstMemorySize() {
+  size_t GetDstMemorySize() const {
     return conv_pd_->dst_primitive_desc().get_size();
   }
 
-  size_t GetDiffWeightsMemorySize() {
+  size_t GetDiffWeightsMemorySize() const {
     return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
   }
 
-  size_t GetDiffSourceMemorySize() {
+  size_t GetDiffSourceMemorySize() const {
     return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
   }
 
@@ -491,7 +491,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
           handler.AcquireDiffDstMemoryFromWeightsPrimitive(
               user_diff_dst_memory_p, pipeline);
 
-      size_t size = handler.GetDiffWeightsMemorySize();
+      const size_t size = handler.GetDiffWeightsMemorySize();
       filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
 
       auto diff_weights_memory_p =
@@ -516,7 +516,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
           handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
                                                         pipeline);
 
-      size_t size = handler.GetDiffSourceMemorySize();
+      const size_t size = handler.GetDiffSourceMemorySize();
       input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
 
       auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(

From d94a3f621b3d5685505bf7e508103823fa6b0652 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Tue, 21 Aug 2018 17:41:16 +0800
Subject: [PATCH 04/62] Disable prelu_op_test until fixing Python3 issues

---
 .../fluid/tests/unittests/test_prelu_op.py    | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index 979be5af3bdc2..1e3e40d54a780 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -51,30 +51,28 @@ def initTestCase(self):
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X', 'Alpha'], 'Out')
-
-    def test_check_grad_ignore_x(self):
+    def test_check_grad_1_ignore_x(self):
         self.check_grad(['Alpha'], 'Out', no_grad_set=set('X'))
 
-    def test_check_grad_ignore_alpha(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
-
-
-class TestCase1(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "all"}
+    def test_check_grad_2(self):
+        self.check_grad(['X', 'Alpha'], 'Out')
 
+    def test_check_grad_3_ignore_alpha(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
 
-class TestCase2(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "channel"}
 
+# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
+#  class TestCase1(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "all"}
 
-class TestCase3(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "element"}
+#  class TestCase2(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "channel"}
 
+#  class TestCase3(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "element"}
 
 if __name__ == "__main__":
     unittest.main()

From 39c526d42fbfdd410c8bb11084a18b019460db7b Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Tue, 21 Aug 2018 20:18:01 +0800
Subject: [PATCH 05/62] Port test_dist_transpiler to it

---
 .../fluid/tests/unittests/test_dist_transpiler.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 9f04d290f7596..1d9ab44ed4474 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -21,6 +21,7 @@
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 import traceback
 import collections
+import six
 
 
 class TranspilerTest(unittest.TestCase):
@@ -644,18 +645,18 @@ def transpiler_test_impl(self):
         self.assertTrue(pserver._slice_vars_and_attrs)
         self.assertTrue(pserver2._slice_vars_and_attrs)
 
-        for idx in xrange(len(pserver._slice_vars_and_attrs)):
+        for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)):
             self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
                              pserver2._slice_vars_and_attrs[idx][0])
 
-            total_numel = reduce(lambda x, y: x * y,
-                                 pserver._slice_vars_and_attrs[idx][0].shape)
+            total_numel = six.moves.reduce(
+                lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape)
             self.assertEqual(
                 total_numel,
-                reduce(lambda x, y: x * y,
-                       pserver._slice_vars_and_attrs[idx][2].shape) + reduce(
-                           lambda x, y: x * y,
-                           pserver2._slice_vars_and_attrs[idx][2].shape))
+                six.moves.reduce(lambda x, y: x * y,
+                                 pserver._slice_vars_and_attrs[idx][2].shape) +
+                six.moves.reduce(lambda x, y: x * y,
+                                 pserver2._slice_vars_and_attrs[idx][2].shape))
 
 
 if __name__ == "__main__":

From 94f6e54db93f06790c84e5932109f08a787c5b2a Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 15:22:07 +0800
Subject: [PATCH 06/62] Add timeout for python3

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index e7dd85ef5c364..f2dce9d2654ac 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 200)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 57dab0bb4c4fb7f902f183d78cc197ebfec27e67 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 15:57:07 +0800
Subject: [PATCH 07/62] Change the link of flowers

---
 python/paddle/dataset/flowers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index aa73bbaf7024e..2a020ce6d01e9 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -45,9 +45,9 @@
 from six.moves import zip
 __all__ = ['train', 'test', 'valid']
 
-DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
-LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
-SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
 DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'

From 6d9b9cb4b6b44db964c592e93528cd6ad8ccfa76 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 16:26:42 +0800
Subject: [PATCH 08/62] Add debug info for anakin cpu

---
 paddle/scripts/paddle_build.sh                     | 6 ++++++
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8460f93b841fe..1e3e2ed0e9ef2 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -192,6 +192,12 @@ function build() {
     cd ${PADDLE_ROOT}/build
     cat <<EOF
     ============================================
+    CPUINFO ...
+    ============================================
+EOF
+    cat /proc/cpuinfo
+    cat <<EOF
+    ============================================
     Building in /paddle/build ...
     ============================================
 EOF
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f2dce9d2654ac..228a5ab917b30 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,7 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 200)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 300)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 6d107b0f392c8471ca6295a9a5366fd390cb7950 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 19:43:56 +0800
Subject: [PATCH 09/62] Fix the test_desc_clone's problem

---
 paddle/scripts/paddle_build.sh                     |  6 ------
 python/paddle/dataset/flowers.py                   |  2 +-
 python/paddle/fluid/tests/unittests/CMakeLists.txt |  2 +-
 .../fluid/tests/unittests/test_desc_clone.py       | 14 ++++++--------
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1e3e2ed0e9ef2..8460f93b841fe 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -192,12 +192,6 @@ function build() {
     cd ${PADDLE_ROOT}/build
     cat <<EOF
     ============================================
-    CPUINFO ...
-    ============================================
-EOF
-    cat /proc/cpuinfo
-    cat <<EOF
-    ============================================
     Building in /paddle/build ...
     ============================================
 EOF
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 2a020ce6d01e9..fd191b6a6d442 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -120,7 +120,7 @@ def reader():
                 file = file.strip()
                 batch = None
                 with open(file, 'rb') as f:
-                    batch = pickle.load(f)
+                    batch = pickle.loads(f.read())
                 data = batch['data']
                 labels = batch['label']
                 for sample, label in zip(data, batch['label']):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 228a5ab917b30..ae13d7ff31b0f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,7 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 300)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 500)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index fa6b67956259f..08579c7dd62ea 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -120,8 +120,8 @@ def operator_equal(a, b):
                 raise ValueError("In operator_equal not equal:{0}\n".format(k))
 
         elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
-            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
+            v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])
+            v1 = sorted(list(six.iteritems(b.__dict__[k])), key=lambda x: x[0])
 
             if v0 != v1:
                 raise ValueError("In operator_equal not equal:{0}\n".format(k))
@@ -139,17 +139,15 @@ def block_equal(a, b):
             continue
 
         elif k == "ops":
+            assert (len(a.ops) == len(b.ops))
             for i in range(0, len(a.ops)):
                 if not operator_equal(a.ops[i], b.ops[i]):
                     raise ValueError("In block_equal not equal:{0}\n".format(k))
-            assert (len(a.ops) == len(b.ops))
 
         elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
-            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
-
-            if v0 != v1:
-                raise ValueError("In block_equal not equal:{0}\n".format(k))
+            for key, value in six.iteritems(v):
+                if str(value) != str(b.__dict__[k][key]):
+                    raise ValueError("In block_equal not equal:{0}\n".format(k))
 
         elif (v != b.__dict__[k]):
             raise ValueError("In block_equal not equal:{0}\n".format(k))

From d49a0d755b6fa74c5cda9915f9238d600916c9d9 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 20:57:17 +0800
Subject: [PATCH 10/62] Fix common download problem

---
 python/paddle/dataset/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 1d7ff582c86a4..ece4046f5b7a7 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -19,6 +19,7 @@
 import os
 import errno
 import shutil
+import six
 import sys
 import importlib
 import paddle.dataset
@@ -94,6 +95,8 @@ def download(url, module_name, md5sum, save_name=None):
                 dl = 0
                 total_length = int(total_length)
                 for data in r.iter_content(chunk_size=4096):
+                    if six.PY2:
+                        data = six.b(data)
                     dl += len(data)
                     f.write(data)
                     done = int(50 * dl / total_length)

From 064b7f3de1d9584505e5b68f0a3822304f24e899 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 20:57:43 +0800
Subject: [PATCH 11/62] Change the md5sum of 102flowers dataset

---
 python/paddle/dataset/flowers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index fd191b6a6d442..ce0cd6009a947 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -48,7 +48,7 @@
 DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
 LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
 SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
-DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data

From 01eec0af91a5dffc4cdbf462f48f4effe8fc4db9 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 22 Aug 2018 23:00:08 +0800
Subject: [PATCH 12/62] Fix flowers dataset reading problem

---
 python/paddle/dataset/flowers.py                   | 5 ++++-
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 -
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index ce0cd6009a947..8c9c721b33c7e 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -120,7 +120,10 @@ def reader():
                 file = file.strip()
                 batch = None
                 with open(file, 'rb') as f:
-                    batch = pickle.loads(f.read())
+                    if six.PY2:
+                        batch = pickle.load(f)
+                    else:
+                        batch = pickle.load(f, encoding='bytes')
                 data = batch['data']
                 labels = batch['label']
                 for sample, label in zip(data, batch['label']):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ae13d7ff31b0f..e7dd85ef5c364 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,7 +64,6 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 500)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 8b8f6487d9b7ed78bbc8c10fddbc217f4dfcd030 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 00:17:15 +0800
Subject: [PATCH 13/62] Add debug info for fetch feed

---
 paddle/scripts/paddle_build.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8460f93b841fe..a55a9e89f7613 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -313,7 +313,9 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        ctest --output-on-failure
+        echo $http_proxy
+        echo $https_proxy
+        ctest -V
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl

From c95ff1c410165f8c97972dedaa81c079b19f8721 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 00:18:30 +0800
Subject: [PATCH 14/62] Add debug info

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index e7dd85ef5c364..0c9bbb766f0cd 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 600)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 79918a84429d7dab4eff9487002a7eb01d4f2aaf Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Wed, 22 Aug 2018 20:06:48 +0800
Subject: [PATCH 15/62] add sequence_mask_op for DAM model

---
 paddle/fluid/API.spec                         |   1 +
 paddle/fluid/operators/batch_norm_op.cc       |   2 +-
 paddle/fluid/operators/sequence_mask_op.cc    |  26 ++++
 paddle/fluid/operators/sequence_mask_op.cu    |  22 ++++
 paddle/fluid/operators/sequence_mask_op.h     | 117 ++++++++++++++++++
 python/paddle/fluid/layers/nn.py              |  22 +++-
 python/paddle/fluid/nets.py                   |   2 +-
 .../tests/book/test_image_classification.py   |   5 +-
 .../tests/unittests/test_sequence_mask.py     |  86 +++++++++++++
 9 files changed, 278 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/operators/sequence_mask_op.cc
 create mode 100644 paddle/fluid/operators/sequence_mask_op.cu
 create mode 100644 paddle/fluid/operators/sequence_mask_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_mask.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9250cde1b2bc8..359db26ed6334 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'max_len', 'mask_dtype'], varargs=None, keywords=None, defaults=('int64',))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 5912a1a17cbd2..969f75544fa42 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -135,7 +135,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization").Reuse("X");
+    AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
               "Store the global mean when training")
diff --git a/paddle/fluid/operators/sequence_mask_op.cc b/paddle/fluid/operators/sequence_mask_op.cc
new file mode 100644
index 0000000000000..e45c18d6aff65
--- /dev/null
+++ b/paddle/fluid/operators/sequence_mask_op.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_mask_op.h"
+
+REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp,
+                  paddle::operators::SequenceMaskOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    sequence_mask,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
+                                          int>,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
+                                          int64_t>);
diff --git a/paddle/fluid/operators/sequence_mask_op.cu b/paddle/fluid/operators/sequence_mask_op.cu
new file mode 100644
index 0000000000000..ff5acf4d9edd5
--- /dev/null
+++ b/paddle/fluid/operators/sequence_mask_op.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_mask_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    sequence_mask,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>);
diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h
new file mode 100644
index 0000000000000..237857b51d911
--- /dev/null
+++ b/paddle/fluid/operators/sequence_mask_op.h
@@ -0,0 +1,117 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceMaskOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
+    auto max_len = ctx->Attrs().Get<int>("max_len");
+    PADDLE_ENFORCE_GT(max_len, 1, "Attr(max_len) must be larger than 1");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
+    auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
+    dim.push_back(max_len);
+    ctx->SetOutputDim("Y", framework::make_ddim(dim));
+  }
+};
+
+class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input of sequence_mask op.");
+    AddOutput("Y", "The output mask of sequence_mask op.");
+    AddAttr<int>("max_len", "The maximum length of the sequence.")
+        .GreaterThan(1);
+    AddAttr<int>("out_dtype", "Output data type");
+    AddComment(R"DOC(
+SequenceMask Operator
+
+This operator outputs a Mask according to Input(X) and Attr(max_len).
+Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the
+Output(Y) is a mask with shape [d_1, d_2, ..., d_n, max_len], where:
+
+Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) 
+    )DOC");
+  }
+};
+
+template <typename Tx, typename Ty>
+struct SequenceMaskForRangeFunctor {
+  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int max_len)
+      : x_(x), y_(y), max_len_(max_len) {}
+
+  HOSTDEVICE void operator()(int y_idx) const {
+    int x_idx = y_idx / max_len_;
+    int j = y_idx % max_len_;
+    y_[y_idx] = static_cast<Ty>(j < x_[x_idx] ? 1 : 0);
+  }
+
+ private:
+  const Tx *x_;
+  Ty *y_;
+  int max_len_;
+};
+
+template <typename DeviceContext, typename Tx>
+struct SequenceMaskFunctor {
+  using Tensor = framework::LoDTensor;
+
+  SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y,
+                      int limits, int max_len)
+      : ctx_(ctx), x_(x), y_(y), limits_(limits), max_len_(max_len) {}
+
+  template <typename Ty>
+  void operator()() const {
+    auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
+    platform::ForRange<DeviceContext> for_range(ctx_, limits_);
+    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, max_len_));
+  }
+
+ private:
+  const DeviceContext &ctx_;
+  const Tx *x_;
+  Tensor *y_;
+  int limits_;
+  int max_len_;
+};
+
+template <typename DeviceContext, typename Tx>
+class SequenceMaskKernel : public framework::OpKernel<Tx> {
+  using Tensor = framework::LoDTensor;
+
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Y");
+    auto max_len = ctx.Attr<int>("max_len");
+    auto out_dtype = static_cast<framework::proto::VarType::Type>(
+        ctx.Attr<int>("out_dtype"));
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    framework::VisitDataType(out_dtype, SequenceMaskFunctor<DeviceContext, Tx>(
+                                            dev_ctx, x->data<Tx>(), y,
+                                            x->numel() * max_len, max_len));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 71592618f540a..1fe457452f008 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -27,6 +27,7 @@
 import random
 from .. import unique_name
 from functools import reduce
+import warnings
 
 __all__ = [
     'fc',
@@ -103,6 +104,7 @@
     'rank_loss',
     'prelu',
     'flatten',
+    'sequence_mask',
 ]
 
 
@@ -2046,7 +2048,7 @@ def batch_norm(input,
         param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
         bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
         data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        in_place(bool, Default False): This argument is deprecated since 0.15.0.
         use_mkldnn(bool, Default false): ${use_mkldnn_comment}
         name(string, Default None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -2068,6 +2070,10 @@ def batch_norm(input,
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
+    if in_place:
+        raise warnings.warn("The argument in_place is deprecated since 0.15.0, "
+                            "please do not set it True.")
+
     input_shape = input.shape
     if data_layout == 'NCHW':
         channel_num = input_shape[1]
@@ -2117,7 +2123,7 @@ def batch_norm(input,
     saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
     saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
 
-    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
+    batch_norm_out = helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="batch_norm",
@@ -5517,3 +5523,15 @@ def flatten(x, axis=1, name=None):
         outputs={'Out': out},
         attrs={"axis": axis})
     return out
+
+
+def sequence_mask(x, max_len, mask_dtype='int64'):
+    helper = LayerHelper('sequence_mask', **locals())
+    y = helper.create_tmp_variable(dtype=mask_dtype)
+    helper.append_op(
+        type='sequence_mask',
+        inputs={'X': [x]},
+        outputs={'Y': y},
+        attrs={'max_len': max_len,
+               'out_dtype': y.dtype})
+    return y
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 051fe84364639..01563cbbb706d 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -229,7 +229,7 @@ def __extend_list__(obj):
             use_mkldnn=use_mkldnn)
 
         if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
+            tmp = layers.batch_norm(input=tmp, act=conv_act)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 9fe361425c128..cd1e8cd682315 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -256,7 +256,10 @@ def main(net_type, use_cuda, is_local=True):
     save_dirname = "image_classification_" + net_type + ".inference.model"
 
     train(net_type, use_cuda, save_dirname, is_local)
-    infer(use_cuda, save_dirname)
+
+    # There is bug in fluid.InferenceTranspiler for VGG.
+    if net_type == "resnet":
+        infer(use_cuda, save_dirname)
 
 
 class TestImageClassification(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
new file mode 100644
index 0000000000000..c6d09df984cc3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from op_test import OpTest
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+import numpy as np
+import copy
+import unittest
+
+
+class SequenceMaskTestBase(OpTest):
+    def initDefaultParameters(self):
+        self.op_type = 'sequence_mask'
+        self.max_len = 10
+        self.mask_dtype = 'int64'
+        self.x = [[0, 3, 4], [5, 7, 9]]
+
+    def initParameters(self):
+        pass
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        if not isinstance(self.x, np.ndarray):
+            self.x = np.array(self.x)
+
+        self.inputs = {'X': self.x}
+        self.outputs = {'Y': self.calc_ground_truth_mask()}
+        self.attrs = {
+            'max_len': self.max_len,
+            'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)
+        }
+
+    def calc_ground_truth_mask(self):
+        shape = self.x.shape + (self.max_len, )
+        index_broadcast = np.broadcast_to(
+            np.reshape(
+                range(self.max_len), newshape=[1] * self.x.ndim + [-1]),
+            shape=shape)
+        x_broadcast = np.broadcast_to(
+            np.reshape(
+                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        return (index_broadcast < x_broadcast).astype(self.mask_dtype)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class SequenceMaskTest1(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'bool'
+
+
+class SequenceMaskTest2(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'uint8'
+
+
+class SequenceMaskTest3(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'int32'
+
+
+class SequenceMaskTest4(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'float32'
+
+
+class SequenceMaskTest5(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'float64'
+
+
+if __name__ == '__main__':
+    unittest.main()

From 80e3ce411d16052766aca33d702b31cb0ec81419 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 13:51:47 +0800
Subject: [PATCH 16/62] For test

---
 paddle/scripts/paddle_build.sh                     | 3 ++-
 python/paddle/dataset/flowers.py                   | 1 +
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a55a9e89f7613..02bf8533d86dd 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -315,7 +315,8 @@ function run_test() {
 EOF
         echo $http_proxy
         echo $https_proxy
-        ctest -V
+        ctest -R test_parallel_executor_fetch_feed -V
+        ctest -R test_dist_se_resnext -V
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 8c9c721b33c7e..c4a3eb55dd16c 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -41,6 +41,7 @@
 import os
 import numpy as np
 from multiprocessing import cpu_count
+import six
 from six.moves import cPickle as pickle
 from six.moves import zip
 __all__ = ['train', 'test', 'valid']
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0c9bbb766f0cd..228a5ab917b30 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,7 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 600)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 300)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 709c37023ae8cf301cc460b665655311523e8b52 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 17:18:14 +0800
Subject: [PATCH 17/62] Polish code

---
 paddle/scripts/paddle_build.sh                     | 5 +----
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 02bf8533d86dd..8460f93b841fe 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -313,10 +313,7 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        echo $http_proxy
-        echo $https_proxy
-        ctest -R test_parallel_executor_fetch_feed -V
-        ctest -R test_dist_se_resnext -V
+        ctest --output-on-failure
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 228a5ab917b30..8ac1cb164e158 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,7 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 300)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

From 0dc5d9c2157bd95479bff67181d05c105e623aa3 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 17:36:54 +0800
Subject: [PATCH 18/62] Port print_siignatures

---
 tools/print_signatures.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 5e7ffd44c7b0b..e2805c4e7e6aa 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -17,6 +17,8 @@
 Usage:
     ./print_signature  "paddle.fluid" > signature.txt
 """
+from __future__ import print_function
+
 import importlib
 import inspect
 import collections
@@ -64,4 +66,4 @@ def visit_all_module(mod):
 visit_all_module(importlib.import_module(sys.argv[1]))
 
 for name in member_dict:
-    print name, member_dict[name]
+    print(name, member_dict[name])

From e895c98f0ae43853e8150594c8ff1cc03a7663b8 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Thu, 23 Aug 2018 07:45:17 +0000
Subject: [PATCH 19/62] add support to max_len is None

---
 paddle/fluid/API.spec                         |  2 +-
 paddle/fluid/operators/sequence_mask_op.h     | 83 ++++++++++++++-----
 python/paddle/fluid/layers/nn.py              | 45 ++++++++--
 .../tests/unittests/test_sequence_mask.py     | 16 +++-
 4 files changed, 112 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 359db26ed6334..01b6053524866 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -162,7 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
-paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'max_len', 'mask_dtype'], varargs=None, keywords=None, defaults=('int64',))
+paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h
index 237857b51d911..0dd554adfe57e 100644
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
@@ -14,6 +14,14 @@
 
 #pragma once
 
+#ifdef __NVCC__
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/reduce.h>
+#else
+#include <algorithm>
+#endif
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"
 
@@ -26,50 +34,60 @@ class SequenceMaskOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
-    auto max_len = ctx->Attrs().Get<int>("max_len");
-    PADDLE_ENFORCE_GT(max_len, 1, "Attr(max_len) must be larger than 1");
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
-    auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
-    dim.push_back(max_len);
-    ctx->SetOutputDim("Y", framework::make_ddim(dim));
+
+    auto maxlen = ctx->Attrs().Get<int>("maxlen");
+    if (maxlen > 0) {  // We can only infershape when maxlen > 0
+      auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
+      dim.push_back(maxlen);
+      ctx->SetOutputDim("Y", framework::make_ddim(dim));
+    }
   }
 };
 
 class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "The input of sequence_mask op.");
+    AddInput("X", "The input tensor of sequence_mask op.");
     AddOutput("Y", "The output mask of sequence_mask op.");
-    AddAttr<int>("max_len", "The maximum length of the sequence.")
-        .GreaterThan(1);
+    AddAttr<int>("maxlen",
+                 "The maximum length of the sequence. If maxlen < 0, maxlen "
+                 "= max(Input(X)).")
+        .SetDefault(-1)
+        .AddCustomChecker([](int &v) {
+          PADDLE_ENFORCE(v < 0 || v >= 1,
+                         "Attr(maxlen) must be less than 0 or larger than 1");
+        });
     AddAttr<int>("out_dtype", "Output data type");
     AddComment(R"DOC(
 SequenceMask Operator
 
-This operator outputs a Mask according to Input(X) and Attr(max_len).
+This operator outputs a Mask according to Input(X) and Attr(maxlen).
 Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the
-Output(Y) is a mask with shape [d_1, d_2, ..., d_n, max_len], where:
+Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
 
 Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) 
+
+If maxlen < 0, maxlen = max(X)
     )DOC");
   }
 };
 
 template <typename Tx, typename Ty>
 struct SequenceMaskForRangeFunctor {
-  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int max_len)
-      : x_(x), y_(y), max_len_(max_len) {}
+  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen)
+      : x_(x), y_(y), maxlen_(maxlen) {}
 
   HOSTDEVICE void operator()(int y_idx) const {
-    int x_idx = y_idx / max_len_;
-    int j = y_idx % max_len_;
+    int x_idx = y_idx / maxlen_;
+    int j = y_idx % maxlen_;
     y_[y_idx] = static_cast<Ty>(j < x_[x_idx] ? 1 : 0);
   }
 
  private:
   const Tx *x_;
   Ty *y_;
-  int max_len_;
+  int maxlen_;
 };
 
 template <typename DeviceContext, typename Tx>
@@ -77,14 +95,14 @@ struct SequenceMaskFunctor {
   using Tensor = framework::LoDTensor;
 
   SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y,
-                      int limits, int max_len)
-      : ctx_(ctx), x_(x), y_(y), limits_(limits), max_len_(max_len) {}
+                      int limits, int maxlen)
+      : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
 
   template <typename Ty>
   void operator()() const {
     auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
     platform::ForRange<DeviceContext> for_range(ctx_, limits_);
-    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, max_len_));
+    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
   }
 
  private:
@@ -92,7 +110,7 @@ struct SequenceMaskFunctor {
   const Tx *x_;
   Tensor *y_;
   int limits_;
-  int max_len_;
+  int maxlen_;
 };
 
 template <typename DeviceContext, typename Tx>
@@ -103,13 +121,32 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *x = ctx.Input<Tensor>("X");
     auto *y = ctx.Output<Tensor>("Y");
-    auto max_len = ctx.Attr<int>("max_len");
+    auto maxlen = ctx.Attr<int>("maxlen");
+
+    auto *x_data = x->data<Tx>();
+    auto x_numel = x->numel();
+    if (maxlen < 0) {
+#ifdef __NVCC__
+      VLOG(10)
+          << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
+      maxlen = static_cast<int>(
+          thrust::reduce(thrust::device_pointer_cast(x_data),
+                         thrust::device_pointer_cast(x_data) + x_numel,
+                         static_cast<Tx>(0), thrust::maximum<Tx>()));
+#else
+      maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
+#endif
+      auto y_dim = framework::vectorize2int(x->dims());
+      y_dim.push_back(maxlen);
+      y->Resize(framework::make_ddim(y_dim));
+    }
+
     auto out_dtype = static_cast<framework::proto::VarType::Type>(
         ctx.Attr<int>("out_dtype"));
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    framework::VisitDataType(out_dtype, SequenceMaskFunctor<DeviceContext, Tx>(
-                                            dev_ctx, x->data<Tx>(), y,
-                                            x->numel() * max_len, max_len));
+    framework::VisitDataType(out_dtype,
+                             SequenceMaskFunctor<DeviceContext, Tx>(
+                                 dev_ctx, x_data, y, x_numel * maxlen, maxlen));
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1fe457452f008..211f828d6ff13 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5525,13 +5525,46 @@ def flatten(x, axis=1, name=None):
     return out
 
 
-def sequence_mask(x, max_len, mask_dtype='int64'):
+def sequence_mask(x, maxlen=None, dtype='int64', name=None):
+    """
+    **SequenceMask Layer**
+
+    This layer outputs a mask according to the input :code:`x` and
+    :code:`maxlen` with data type of :code:`dtype`.
+
+    Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
+    :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
+    
+    .. math::
+     
+        y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
+
+    Args:
+        x (Variable): Input tensor of sequence_mask layer, 
+                      whose elements are integers less than :code:`maxlen`.
+        maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
+                           is None, it would be replace with :math:`max(x)`.
+        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
+        name (str|None): A name for this layer(optional). If set None, the 
+                         layer will be named automatically.  
+    
+    Returns:
+        Variable: The output sequence mask.
+    
+    """
+
     helper = LayerHelper('sequence_mask', **locals())
-    y = helper.create_tmp_variable(dtype=mask_dtype)
+    if name is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    else:
+        out = helper.create_tmp_variable(dtype=dtype, name=name)
+
     helper.append_op(
         type='sequence_mask',
         inputs={'X': [x]},
-        outputs={'Y': y},
-        attrs={'max_len': max_len,
-               'out_dtype': y.dtype})
-    return y
+        outputs={'Y': out},
+        attrs={
+            'max_len': maxlen if maxlen is not None else -1,
+            'out_dtype': out.dtype
+        })
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
index c6d09df984cc3..02c5b204082ec 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_mask.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 from op_test import OpTest
+import paddle.fluid as fluid
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
+import paddle.fluid.core as core
 import numpy as np
 import copy
 import unittest
@@ -22,7 +24,7 @@
 class SequenceMaskTestBase(OpTest):
     def initDefaultParameters(self):
         self.op_type = 'sequence_mask'
-        self.max_len = 10
+        self.maxlen = 10
         self.mask_dtype = 'int64'
         self.x = [[0, 3, 4], [5, 7, 9]]
 
@@ -38,15 +40,16 @@ def setUp(self):
         self.inputs = {'X': self.x}
         self.outputs = {'Y': self.calc_ground_truth_mask()}
         self.attrs = {
-            'max_len': self.max_len,
+            'maxlen': self.maxlen,
             'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)
         }
 
     def calc_ground_truth_mask(self):
-        shape = self.x.shape + (self.max_len, )
+        maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
+        shape = self.x.shape + (maxlen, )
         index_broadcast = np.broadcast_to(
             np.reshape(
-                range(self.max_len), newshape=[1] * self.x.ndim + [-1]),
+                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
             shape=shape)
         x_broadcast = np.broadcast_to(
             np.reshape(
@@ -82,5 +85,10 @@ def initParameters(self):
         self.mask_dtype = 'float64'
 
 
+class SequenceMaskTest6(SequenceMaskTestBase):
+    def initParameters(self):
+        self.maxlen = -1
+
+
 if __name__ == '__main__':
     unittest.main()

From 41c10799b8165e67416f26728569377bc92e5775 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 17:40:26 +0800
Subject: [PATCH 20/62] Port tools

---
 tools/check_ctest_hung.py | 4 +++-
 tools/timeline.py         | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py
index 7de76c381b29a..c44690a93ac3c 100644
--- a/tools/check_ctest_hung.py
+++ b/tools/check_ctest_hung.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
 import re
 
@@ -46,7 +48,7 @@ def main():
                 start_parts = escape(l).split(" ")
                 m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
                 started.add(m.group(1))
-    print "Diff: ", started - passed
+    print("Diff: ", started - passed)
 
 
 if __name__ == "__main__":
diff --git a/tools/timeline.py b/tools/timeline.py
index b413bb6fe0505..f850476831d84 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -14,6 +14,7 @@
 
 import argparse
 import json
+import six
 import sys
 import unittest
 
@@ -124,7 +125,7 @@ def _allocate_pid(self):
         return cur_pid
 
     def _allocate_pids(self):
-        for k, profile_pb in self._profile_dict.iteritems():
+        for k, profile_pb in six.iteritems(self._profile_dict):
             for event in profile_pb.events:
                 if event.type == profiler_pb2.Event.CPU:
                     if (k, event.device_id, "CPU") not in self._devices:
@@ -140,7 +141,7 @@ def _allocate_pids(self):
                                                     (k, event.device_id), pid)
 
     def _allocate_events(self):
-        for k, profile_pb in self._profile_dict.iteritems():
+        for k, profile_pb in six.iteritems(self._profile_dict):
             for event in profile_pb.events:
                 if event.type == profiler_pb2.Event.CPU:
                     type = "CPU"

From 13686c44747f5a678ee10adf3cee4c509fe07d00 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 17:41:16 +0800
Subject: [PATCH 21/62] Change to debug case

---
 paddle/scripts/paddle_build.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8460f93b841fe..00fb0310e15be 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -313,7 +313,9 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        ctest --output-on-failure
+        #ctest --output-on-failure
+        ctest -R test_parallel_executor_fetch_feed -V
+        ctest -R test_dist_se_resnext -V
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl

From 2aac36b3f9bf47e7862091ba28ea925cf6ba346f Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 19:03:15 +0800
Subject: [PATCH 22/62] For test

---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 00fb0310e15be..d5af0eefe3d53 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -314,8 +314,8 @@ function run_test() {
     ========================================
 EOF
         #ctest --output-on-failure
-        ctest -R test_parallel_executor_fetch_feed -V
         ctest -R test_dist_se_resnext -V
+        ctest -R test_parallel_executor_fetch_feed -V
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl

From 83f4edabe990bd496720a5dd098f3220dbdb337a Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 23 Aug 2018 19:58:14 +0800
Subject: [PATCH 23/62] remove broadcast in sequence_expand

---
 paddle/fluid/operators/sequence_expand_op.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 39301e1ac0971..9228c81310463 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -53,25 +53,27 @@ struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
     int out_offset = 0;
-    auto& eigen_place = *context.eigen_device();
+    int x_item_length = x.numel() / x.dims()[0];
+    auto out_data = out->data<T>();
+    auto x_data = x.data<T>();
     for (size_t i = 1; i < ref_lod.size(); ++i) {
       int repeat_num = ref_lod[i] - ref_lod[i - 1];
       int x_start = x_lod[i - 1];
       int x_end = x_lod[i];
       int x_seq_len = x_end - x_start;
       if (repeat_num > 0) {
-        auto x_sub_tensor = x.Slice(x_start, x_end);
-        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
         int out_start = out_offset;
         if (out->lod().size() == 1) {
           out_start = out->lod()[0][out_offset];
         }
-        auto out_sub_tensor =
-            out->Slice(out_start, out_start + x_seq_len * repeat_num);
-        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
-        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
-            EigenMatrix<T>::From(x_sub_tensor)
-                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            for (int l = 0; l < x_item_length; l++) {
+              out_data[(out_start + j * x_seq_len + k) * x_item_length + l] =
+                  x_data[(x_start + k) * x_item_length + l];
+            }
+          }
+        }
       }
       out_offset += repeat_num;
     }

From 23bfdf9987c1105ebc067dd42b6ffd3ec8104b4e Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 20:16:26 +0800
Subject: [PATCH 24/62] Port APISpec check

---
 paddle/scripts/paddle_build.sh | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d5af0eefe3d53..4979bd55c11aa 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -331,7 +331,17 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    if [ "$1" != "" ]; then
+        echo "checking python abi: $1"
+        if [ "$1" == "cp35-cp35m" ]; then
+            # Always use python2 to generate api signature
+            LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} PATH=/opt/python/cp27-cp27mu/bin/:${PATH} python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+        else
+            python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+        fi
+    else
+        python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    fi
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate
 
@@ -625,7 +635,7 @@ function main() {
         gen_capi_package
         gen_fluid_inference_lib
         test_fluid_inference_lib
-        assert_api_not_changed
+        assert_api_not_changed ${PYTHON_ABI:-""}
         ;;
       *)
         print_usage

From 0d46f518aef8d8893f5f438475e6bc53b6f2b8bd Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 23 Aug 2018 14:32:46 +0800
Subject: [PATCH 25/62] refine avx condition and warning

---
 cmake/configure.cmake                | 22 ++++++++++++++--------
 paddle/fluid/platform/CMakeLists.txt |  2 +-
 paddle/fluid/platform/cpu_info.h     |  2 +-
 paddle/fluid/platform/init.cc        | 17 +++++++++++++++++
 4 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e03e15bfc017c..7e5d8a76217f1 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -50,14 +50,20 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX512F_FOUND)
-        set(SIMD_FLAG ${AVX512F_FLAG})
-    elseif(WITH_AVX AND AVX2_FOUND)
-        set(SIMD_FLAG ${AVX2_FLAG})
-    elseif(WITH_AVX AND AVX_FOUND)
-        set(SIMD_FLAG ${AVX_FLAG})
-    elseif(SSE3_FOUND)
-        set(SIMD_FLAG ${SSE3_FLAG})
+    set(SIMD_FLAG)
+    if(WITH_AVX)
+        if (AVX512F_FOUND)
+            set(SIMD_FLAG "${SIMD_FLAG} ${AVX512F_FLAG}")
+        endif()
+        if (AVX2_FOUND)
+            set(SIMD_FLAG "${SIMD_FLAG} ${AVX2_FLAG}")
+        endif()
+        if (AVX_FOUND)
+            set(SIMD_FLAG "${SIMD_FLAG} ${AVX_FLAG}")
+        endif()
+        if (SSE3_FOUND)
+            set(SIMD_FLAG "${SIMD_FLAG} ${SSE3_FLAG}")
+        endif()
     endif()
 endif()
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index f08c0e8e34517..75d3856d0dda8 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -50,7 +50,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 5d17978dd7946..30c8fbcfce92a 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -51,7 +51,7 @@ typedef enum {
 } cpu_isa_t;  // Instruction set architecture
 
 // May I use some instruction
-inline bool MayIUse(const cpu_isa_t cpu_isa);
+bool MayIUse(const cpu_isa_t cpu_isa);
 
 }  // namespace jit
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 6f1f0c4796f3b..020ce4d6f5941 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -120,6 +121,22 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
+
+  if (platform::jit::MayIUse(platform::jit::avx512_common)) {
+#ifndef __AVX512F__
+    LOG(WARNING) << "AVX512F is available, Please re-compile on local machine";
+#endif
+  }
+  if (platform::jit::MayIUse(platform::jit::avx2)) {
+#ifndef __AVX2__
+    LOG(WARNING) << "AVX2 is available, Please re-compile on local machine";
+#endif
+  }
+  if (platform::jit::MayIUse(platform::jit::avx)) {
+#ifndef __AVX__
+    LOG(WARNING) << "AVX is available, Please re-compile on local machine";
+#endif
+  }
 }
 
 void InitGLOG(const std::string &prog_name) {

From 2eb46c2b06ce745eba77489029198cef15eb9980 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 23 Aug 2018 18:10:11 +0800
Subject: [PATCH 26/62] add cpu vec test

---
 paddle/fluid/operators/math/CMakeLists.txt  |   1 +
 paddle/fluid/operators/math/cpu_vec.h       |  12 +-
 paddle/fluid/operators/math/cpu_vec_test.cc | 140 ++++++++++++++++++++
 3 files changed, 150 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/operators/math/cpu_vec_test.cc

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index d2b772d11379c..1b75df5d7d97e 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -65,3 +65,4 @@ if(WITH_GPU)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
+cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 48c0da0e368a0..3575d9ca67c8a 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -15,6 +15,13 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -22,7 +29,6 @@ namespace math {
 
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
 
 template <typename T>
 inline T sigmoid(T x) {
@@ -46,7 +52,7 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
   const T max = SIGMOID_THRESHOLD_MAX;
   for (int i = 0; i < n; ++i) {
     T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = 1.0 / (1.0 + std::exp(-tmp));
+    y[i] = sigmoid<T>(tmp);
   }
 }
 
@@ -96,7 +102,7 @@ class VecActivations {
     } else if (type == "identity" || type == "") {
       return vec_identity<T, isa>;
     }
-    PADDLE_THROW("Not support type %s.", type);
+    LOG(FATAL) << "Not support type: " << type;
   }
 };
 
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
new file mode 100644
index 0000000000000..773d4bec4f9c4
--- /dev/null
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -0,0 +1,140 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <cmath>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/operators/math/cpu_vec.h"
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+constexpr int repeat = 1000;
+
+template <typename T>
+inline T _sigmoid(T x) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  T tmp = (x < min) ? min : ((x > max) ? max : x);
+  return 1. / (1. + std::exp(-tmp));
+}
+
+template <typename T>
+inline T _tanh(T x) {
+  return 2. * _sigmoid<T>(2. * x) - 1.;
+}
+
+template <typename T>
+void ref_sigmoid(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _sigmoid(x[i]);
+  }
+}
+
+template <typename T>
+void ref_tanh(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _tanh(x[i]);
+  }
+}
+template <typename T>
+void ref_relu(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
+template <typename T>
+void RandomVec(const int n, T* a) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  const T lower = static_cast<T>(-20.f);
+  const T upper = static_cast<T>(-20.f);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+template <typename T>
+void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
+                  std::function<void(const int, const T*, T*)> ref) {
+  std::vector<T> x(n);
+  std::vector<T> ytgt(n), yref(n);
+  RandomVec<T>(n, x.data());
+
+  const T* x_data = x.data();
+  T* ytgt_data = ytgt.data();
+  T* yref_data = yref.data();
+  auto st = GetCurrentUS();
+  for (int i = 0; i < repeat; ++i) {
+    tgt(n, x_data, ytgt_data);
+  }
+  auto mt = GetCurrentUS();
+  for (int i = 0; i < repeat; ++i) {
+    ref(n, x_data, yref_data);
+  }
+  auto et = GetCurrentUS();
+
+  VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
+          << " us, tgt takes: " << (mt - st) / repeat;
+  for (int i = 0; i < n; ++i) {
+    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
+  }
+}
+
+TEST(CpuVecTest, sigmoid) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+                        ref_sigmoid<float>);
+  }
+  TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
+}
+
+TEST(CpuVecTest, tanh) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx512_common>,
+                        ref_tanh<float>);
+  }
+  TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
+}
+
+TEST(CpuVecTest, relu) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx512_common>,
+                        ref_relu<float>);
+  }
+  TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
+}

From 3fd169daedb408fb922d6342f3f8b550ec1483b9 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 21:32:51 +0800
Subject: [PATCH 27/62] Resume all tests

---
 paddle/scripts/paddle_build.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 4979bd55c11aa..49a66799bc253 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -313,9 +313,7 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        #ctest --output-on-failure
-        ctest -R test_dist_se_resnext -V
-        ctest -R test_parallel_executor_fetch_feed -V
+        ctest --output-on-failure
         # make install should also be test when unittest
         make install -j `nproc`
         pip install /usr/local/opt/paddle/share/wheels/*.whl

From 25976fe736804e415a4f3b7fadc5c8ce3c9495f7 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 23 Aug 2018 21:50:35 +0800
Subject: [PATCH 28/62] optimize the sigmoid and tanh

---
 paddle/fluid/operators/math/cpu_vec.h       | 34 ++++++++++++++++-----
 paddle/fluid/operators/math/cpu_vec_test.cc |  5 +--
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 3575d9ca67c8a..6d8acbe5397cf 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cmath>
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #ifdef __AVX__
@@ -31,15 +32,24 @@ namespace math {
 #define SIGMOID_THRESHOLD_MAX 13.0
 
 template <typename T>
-inline T sigmoid(T x) {
-  return 1. / (1. + exp(-x));
+inline void vec_exp(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
 }
 
-template <typename T>
-inline T tanh(T x) {
-  return 2. * sigmoid(2. * x) - 1.;
+#ifdef PADDLE_WITH_MKLML
+template <>
+inline void vec_exp<float>(const int n, const float* x, float* y) {
+  platform::dynload::vsExp(n, x, y);
 }
 
+template <>
+inline void vec_exp<double>(const int n, const double* x, double* y) {
+  platform::dynload::vdExp(n, x, y);
+}
+#endif
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
@@ -51,15 +61,23 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
   const T min = SIGMOID_THRESHOLD_MIN;
   const T max = SIGMOID_THRESHOLD_MAX;
   for (int i = 0; i < n; ++i) {
-    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = sigmoid<T>(tmp);
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(0) - y[i];
+  }
+  vec_exp<T>(n, y, y);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
   }
 }
 
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
-    y[i] = tanh<T>(x[i]);
+    y[i] = static_cast<T>(2) * x[i];
+  }
+  vec_exp<T>(n, y, y);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
   }
 }
 
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 773d4bec4f9c4..ab4858984d233 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -33,12 +33,13 @@ inline T _sigmoid(T x) {
   const T min = SIGMOID_THRESHOLD_MIN;
   const T max = SIGMOID_THRESHOLD_MAX;
   T tmp = (x < min) ? min : ((x > max) ? max : x);
-  return 1. / (1. + std::exp(-tmp));
+  return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
 }
 
 template <typename T>
 inline T _tanh(T x) {
-  return 2. * _sigmoid<T>(2. * x) - 1.;
+  return static_cast<T>(2) * _sigmoid<T>(static_cast<T>(2) * x) -
+         static_cast<T>(1);
 }
 
 template <typename T>

From b1fc23869417d3bf6c1f647042c8ecfea58043b4 Mon Sep 17 00:00:00 2001
From: guochaorong <guochaorong@baidu.com>
Date: Thu, 23 Aug 2018 22:36:53 +0800
Subject: [PATCH 29/62] Revert "Disable in_place in batch_norm API. (#12736)"

This reverts commit f5d5d7b2d989e8aa5b5e637fd04318566b23f2fe.
---
 paddle/fluid/operators/batch_norm_op.cc                  | 2 +-
 python/paddle/fluid/layers/nn.py                         | 9 ++-------
 python/paddle/fluid/nets.py                              | 2 +-
 .../paddle/fluid/tests/book/test_image_classification.py | 5 +----
 4 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 969f75544fa42..5912a1a17cbd2 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -135,7 +135,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization");
+    AddOutput("Y", "result after normalization").Reuse("X");
     AddOutput("MeanOut",
               "Share memory with Mean. "
               "Store the global mean when training")
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 83250f65e4fad..4bd260a00503c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -27,7 +27,6 @@
 import random
 from .. import unique_name
 from functools import reduce
-import warnings
 
 __all__ = [
     'fc',
@@ -2048,7 +2047,7 @@ def batch_norm(input,
         param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
         bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
         data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): This argument is deprecated since 0.15.0.
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
         use_mkldnn(bool, Default false): ${use_mkldnn_comment}
         name(string, Default None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -2070,10 +2069,6 @@ def batch_norm(input,
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
-    if in_place:
-        raise warnings.warn("The argument in_place is deprecated since 0.15.0, "
-                            "please do not set it True.")
-
     input_shape = input.shape
     if data_layout == 'NCHW':
         channel_num = input_shape[1]
@@ -2123,7 +2118,7 @@ def batch_norm(input,
     saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
     saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
 
-    batch_norm_out = helper.create_tmp_variable(dtype)
+    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="batch_norm",
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 01563cbbb706d..051fe84364639 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -229,7 +229,7 @@ def __extend_list__(obj):
             use_mkldnn=use_mkldnn)
 
         if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(input=tmp, act=conv_act)
+            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index cd1e8cd682315..9fe361425c128 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -256,10 +256,7 @@ def main(net_type, use_cuda, is_local=True):
     save_dirname = "image_classification_" + net_type + ".inference.model"
 
     train(net_type, use_cuda, save_dirname, is_local)
-
-    # There is bug in fluid.InferenceTranspiler for VGG.
-    if net_type == "resnet":
-        infer(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
 
 
 class TestImageClassification(unittest.TestCase):

From 1f270275a6d3b9c2a279609aa781e1cd30018523 Mon Sep 17 00:00:00 2001
From: guochaorong <guochaorong@baidu.com>
Date: Thu, 23 Aug 2018 22:59:59 +0800
Subject: [PATCH 30/62] Revert "Add Python Callstacks when Op::Run error
 (#12759)"

This reverts commit b2df17003f22712078df75b299fb27934650319d.
---
 paddle/fluid/framework/op_proto_maker.cc      |  4 --
 paddle/fluid/framework/op_proto_maker.h       |  1 -
 paddle/fluid/framework/operator.cc            | 61 +++++--------------
 paddle/fluid/operators/top_k_op.cc            |  2 -
 paddle/fluid/pybind/const_value.cc            |  3 -
 python/paddle/fluid/framework.py              |  5 --
 .../tests/unittests/test_operator_desc.py     |  5 +-
 7 files changed, 16 insertions(+), 65 deletions(-)

diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 9c289243c5a27..2288c7fe6609a 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -129,10 +129,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                     "Optimized for variable")
       .SetDefault({});
 
-  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
-                                    "Callstack for Op Creatation.")
-      .SetDefault({});
-
   Validate();
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index cb9c8ab1704ab..80970291c9c23 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -39,7 +39,6 @@ class OpProtoAndCheckerMaker {
  public:
   static const char *OpRoleAttrName() { return "op_role"; }
   static const char *OpRoleVarAttrName() { return "op_role_var"; }
-  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
 
   void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9f8cdf1aeba43..d04f7744961b2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -11,17 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/operator.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
 #include <algorithm>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -129,48 +127,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  try {
-    if (VLOG_IS_ON(4)) {
-      VLOG(4) << place << " " << DebugStringEx(&scope);
-    }
-    if (platform::is_gpu_place(place)) {
+  VLOG(4) << place << " " << DebugStringEx(&scope);
+  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("Cannot run operator on place %s", place);
+    PADDLE_THROW("Cannot run operator on place %s", place);
 #else
-      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
-      platform::SetDeviceId(dev_id);
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    platform::SetDeviceId(dev_id);
 #endif
-    }
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
-    RunImpl(scope, place);
-    if (VLOG_IS_ON(3)) {
-      VLOG(3) << place << " " << DebugStringEx(&scope);
-    }
-  } catch (platform::EnforceNotMet exception) {
-    if (Attrs().count("sub_block") != 0) {
-      throw exception;
-    }
-
-    auto& callstack = Attr<std::vector<std::string>>(
-        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
-
-    if (callstack.empty()) {
-      throw exception;
-    }
-    std::ostringstream sout;
-    sout << "Invoke operator " << Type() << " error.\n";
-    sout << "Python Callstacks: \n";
-    for (auto& line : callstack) {
-      sout << line;
-    }
-    sout << "C++ Callstacks: \n";
-    sout << exception.err_str_;
-    exception.err_str_ = sout.str();
-    throw exception;
-  } catch (...) {
-    std::rethrow_exception(std::current_exception());
   }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::RecordEvent record_event(Type(), pool.Get(place));
+  RunImpl(scope, place);
+  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -198,7 +167,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
 }
 
 bool OperatorBase::HasOutputs(const std::string& name) const {
-  if (outputs_.end() != outputs_.find(name)) {
+  if (outputs_.find(name) != outputs_.end()) {
     return true;
   } else {
     return false;
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 92a0697e27ba0..4a8ac441cfaf6 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel {
                    "Output(Indices) of TopkOp should not be null.");
 
     auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(input_dims.size(), 2,
-                      "Rank of TopK op's input must be 2.");
     const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
 
     PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index a81715c3b317a..e4415ed15c791 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -43,9 +43,6 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpRoleVarAttrName",
       framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
-  op_proto_and_checker_maker.def(
-      "kOpCreationCallstackAttrName",
-      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
 }
 
 }  // namespace pybind
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index e0ddd3b5ffecf..febb750ee1af2 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -18,7 +18,6 @@
 import contextlib
 import re
 import six
-import traceback
 
 import numpy as np
 
@@ -506,10 +505,6 @@ def __init__(self,
         if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
             del op_attrs[role_var_name]
 
-        callstack_var_name = op_maker.kOpCreationCallstackAttrName()
-        op_attrs[callstack_var_name] = list(
-            reversed(traceback.format_stack()))[1:]
-
         if len(self.desc.type()) != 0:
             return
         if type is None:
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 3ac82680733fe..6d01955993324 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -67,10 +67,7 @@ def test_op_desc_creation(self):
         self.assertEqual(mul_op.output("Out"), ["mul.out"])
         self.assertEqual(
             set(mul_op.attr_names),
-            set([
-                "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
-                "op_callstack"
-            ]))
+            set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
         self.assertEqual(mul_op.attr("x_num_col_dims"), 1)

From 0eccd59425c24fb3367c48d1545863c624d4c77b Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 23 Aug 2018 23:00:03 +0800
Subject: [PATCH 31/62] Keep APISpec the same with Python2

---
 paddle/scripts/paddle_build.sh | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 49a66799bc253..5dadef7e7657e 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -329,16 +329,11 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    if [ "$1" != "" ]; then
-        echo "checking python abi: $1"
-        if [ "$1" == "cp35-cp35m" ]; then
-            # Always use python2 to generate api signature
-            LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} PATH=/opt/python/cp27-cp27mu/bin/:${PATH} python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-        else
-            python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-        fi
-    else
-        python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    if [ "$1" == "cp35-cp35m" ]; then
+        # Use sed to make python2 and python3 sepc keeps the same
+        sed -i 's/arg0: str/arg0: unicode/g' new.spec
+        sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
     fi
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate

From e3bb98eb38f8938ee3a0f8b07d8f486aca6ccfe3 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 00:01:09 +0800
Subject: [PATCH 32/62] optimize relu with avx and avx512

---
 paddle/fluid/operators/math/cpu_vec.h | 83 ++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 6d8acbe5397cf..e74e84055a9f2 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -31,6 +31,13 @@ namespace math {
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
 
+#define AVX_FLOAT_BLOCK 8
+#define AVX_DOUBLE_BLOCK 4
+#define AVX2_FLOAT_BLOCK 8
+#define AVX2_DOUBLE_BLOCK 4
+#define AVX512_FLOAT_BLOCK 16
+#define AVX512_DOUBLE_BLOCK 8
+
 template <typename T>
 inline void vec_exp(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
@@ -88,24 +95,82 @@ inline void vec_relu(const int n, const T* x, T* y) {
   }
 }
 
+template <>
+inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
+                                                float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, zeros); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+
+#else
+  vec_relu<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
 template <>
 inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
                                                  float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
+  vec_relu<float, platform::jit::avx>(n, x, y);
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
-                                                float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
+inline void vec_relu<float, platform::jit::avx512_common>(const int n,
+                                                          const float* x,
+                                                          float* y) {
+#ifdef __AVX512F__
+  // test me
+  constexpr int block = AVX512_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::avx2>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m512 zeros = _mm512_setzero_ps();
+  __m512 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm512_loadu_ps(x + i);    \
+  tmp = _mm512_max_ps(tmp, zeros); \
+  _mm512_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
   }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+#else
+  vec_relu<float, platform::jit::avx2>(n, x, y);
+#endif
 }
 
+// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 class VecActivations {
  public:

From 6bd89ba5b6966f9c328cbf3fe187a5768c5e0664 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 00:47:17 +0800
Subject: [PATCH 33/62] fix typo

---
 paddle/fluid/operators/math/cpu_vec.h       | 2 +-
 paddle/fluid/operators/math/cpu_vec_test.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index e74e84055a9f2..a2e2b5a7fed03 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -82,7 +82,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = static_cast<T>(2) * x[i];
   }
-  vec_exp<T>(n, y, y);
+  vec_sigmoid<T>(n, y, y);
   for (int i = 0; i < n; ++i) {
     y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
   }
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index ab4858984d233..0888e44fa655f 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -68,7 +68,7 @@ void RandomVec(const int n, T* a) {
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
   const T lower = static_cast<T>(-20.f);
-  const T upper = static_cast<T>(-20.f);
+  const T upper = static_cast<T>(20.f);
   for (int i = 0; i < n; ++i) {
     a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
   }

From a7849db561df9bf7a2c5961df63a861106f90b43 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 24 Aug 2018 00:47:41 +0800
Subject: [PATCH 34/62] Port new added code

---
 python/paddle/fluid/tests/unittests/test_attention_lstm_op.py | 2 +-
 python/paddle/fluid/transpiler/details/program_utils.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
index a7382c2244ec3..1b9c3efe0fa9e 100644
--- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
@@ -37,7 +37,7 @@ def attention_lstm(
     T = sum(lod[0])
     N = len(lod[0])
     M = x.shape[1]
-    D = b.shape[1] / 4
+    D = b.shape[1] // 4
     assert T == x.shape[0]
     assert len(fcws) == len(fcbs)
     hidden = []
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index 420ae6dfd4b75..64863aceee11c 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -159,7 +159,7 @@ def program_to_code(prog):
             get_indent_space(indent), '{', block_idx))
         indent += 1
         # sort all vars
-        all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+        all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
         for var in all_vars:
             print("{}{}".format(
                 get_indent_space(indent), variable_to_code(var[1])))

From ca22586818c2ce9d9b4ac83f49a3c7a54570cc6b Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 24 Aug 2018 10:51:37 +0800
Subject: [PATCH 35/62] code optimize

(cherry picked from commit 587cca7)
---
 paddle/fluid/operators/fill_constant_op.cc  | 27 +++++++++++++++------
 paddle/fluid/operators/uniform_random_op.cc |  2 +-
 paddle/fluid/operators/uniform_random_op.cu |  2 +-
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 130f18dde4f97..2826b82117db1 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -41,19 +40,33 @@ class FillConstantOp : public framework::OperatorBase {
         static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
     auto force_cpu = Attr<bool>("force_cpu");
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+
+    framework::Tensor *tensor = nullptr;
+
+    auto &out_var = *scope.FindVar(Output("Out"));
+
+    if (out_var.IsType<framework::LoDTensor>()) {
+      tensor = out_var.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    } else if (out_var.IsType<framework::SelectedRows>()) {
+      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    } else {
+      PADDLE_THROW(
+          "fill constant op's output only"
+          "supports SelectedRows and LoDTensor");
+    }
+
     if (force_cpu) {
       auto cpu = platform::CPUPlace();
-      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(cpu, framework::ToTypeIndex(data_type));
     } else {
-      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type));
     }
 
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
-    math::set_constant(dev_ctx, &out, value);
+    math::set_constant(dev_ctx, tensor, value);
   }
 };
 
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 5248767c2eeb9..763bb403588d1 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -37,7 +37,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW(
           "uniform_random_op's output only"
-          "supports SelectedRows and Tensor");
+          "supports SelectedRows and LoDTensor");
     }
     T* data = tensor->mutable_data<T>(ctx.GetPlace());
     unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index e1c7323a30233..bbb692b0ddfc1 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -54,7 +54,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     } else {
       PADDLE_THROW(
           "uniform_random_op's output only"
-          "supports SelectedRows and Tensor");
+          "supports SelectedRows and LoDTensor");
     }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));

From fcf20eed0fba2c6576fd66139a9d3f134a0793c4 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Fri, 24 Aug 2018 10:57:40 +0800
Subject: [PATCH 36/62] fix sparse update bug

---
 paddle/fluid/operators/distributed/variable_response.cc | 1 +
 paddle/fluid/operators/listen_and_serv_op.cc            | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 8e38b3713f28b..1617cc1b95216 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -151,6 +151,7 @@ bool VariableResponse::CopySelectRowsData(
     ::google::protobuf::io::CodedInputStream* input,
     const platform::DeviceContext& ctx, int length) {
   auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
+  slr->mutable_rows()->clear();
   slr->mutable_rows()->resize(length /
                               framework::SizeOfType(typeid(int64_t)));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index f196e18fe122a..4cc2159d9f228 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -165,12 +165,13 @@ void ListenAndServOp::RunSyncLoop(
                           recv_scope);
     VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
-    rpc_service_->SetCond(distributed::kRequestGet);
-    rpc_service_->WaitBarrier(distributed::kRequestGet);
-    rpc_service_->ResetBarrierCounter();
     // reset received sparse vars to avoid reuse it in the next mini-batch
     dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
+
+    rpc_service_->SetCond(distributed::kRequestGet);
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
+    rpc_service_->ResetBarrierCounter();
   }  // while(true)
 }
 

From fca139b5e302c46a26d99d0b57546010d3c97590 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Fri, 24 Aug 2018 11:07:47 +0800
Subject: [PATCH 37/62] Fix flowers dataset download problem

---
 python/paddle/dataset/common.py  |  3 +++
 python/paddle/dataset/flowers.py | 14 +++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 1d7ff582c86a4..ece4046f5b7a7 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -19,6 +19,7 @@
 import os
 import errno
 import shutil
+import six
 import sys
 import importlib
 import paddle.dataset
@@ -94,6 +95,8 @@ def download(url, module_name, md5sum, save_name=None):
                 dl = 0
                 total_length = int(total_length)
                 for data in r.iter_content(chunk_size=4096):
+                    if six.PY2:
+                        data = six.b(data)
                     dl += len(data)
                     f.write(data)
                     done = int(50 * dl / total_length)
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index aa73bbaf7024e..0a1cdaceaf3be 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -35,6 +35,7 @@
 import functools
 from .common import download
 import tarfile
+import six
 import scipy.io as scio
 from paddle.dataset.image import *
 from paddle.reader import *
@@ -45,10 +46,10 @@
 from six.moves import zip
 __all__ = ['train', 'test', 'valid']
 
-DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
-LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
-SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
@@ -120,7 +121,10 @@ def reader():
                 file = file.strip()
                 batch = None
                 with open(file, 'rb') as f:
-                    batch = pickle.load(f)
+                    if six.PY2:
+                        batch = pickle.load(f)
+                    else:
+                        batch = pickle.load(f, encoding='bytes')
                 data = batch['data']
                 labels = batch['label']
                 for sample, label in zip(data, batch['label']):

From 8f9bbc2834c35d368b680d87fe50342717d28d31 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 24 Aug 2018 11:27:54 +0800
Subject: [PATCH 38/62] add unit test

---
 .../tests/unittests/test_fill_constant_op.py  | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 44fb1d047dff4..b73711b19d613 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -18,6 +18,9 @@
 import numpy as np
 from op_test import OpTest
 
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
 
 class TestFillConstantOp1(OpTest):
     def setUp(self):
@@ -47,5 +50,27 @@ def test_check_output(self):
         self.check_output()
 
 
+class TestFillConstantOpWithSelectedRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        # create Out Variable
+        out = scope.var('Out').get_selected_rows()
+
+        # create and run fill_constant_op operator
+        fill_constant_op = Operator(
+            "fill_constant", shape=[123, 92], value=3.8, Out='Out')
+        fill_constant_op.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out)
+        self.assertEqual(result_array, np.full((123, 92), 3.8))
+
+    def test_fill_constant_with_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()

From 7a4924cd44a47f3562d62c01d0c40e84ca78540e Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 11:46:59 +0800
Subject: [PATCH 39/62] further optimize sigmoid with avx and avx512

---
 paddle/fluid/operators/math/cpu_vec.h       | 116 ++++++++++++++++++++
 paddle/fluid/operators/math/cpu_vec_test.cc |   6 +-
 2 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index a2e2b5a7fed03..52f072eb0e01e 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -77,6 +77,122 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
   }
 }
 
+template <>
+inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
+                                                   float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {  // can use larger threshold if necessary
+    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, min);   \
+  tmp = _mm256_min_ps(tmp, max);   \
+  tmp = _mm256_sub_ps(zeros, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest != 0) {
+    i = n - block;
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+
+  vec_exp<float>(n, y, y);
+
+  __m256 ones = _mm256_set1_ps(1.0f);
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(y + i);   \
+  tmp = _mm256_add_ps(ones, tmp); \
+  tmp = _mm256_div_ps(ones, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step
+  for (i = n - rest; i < n; ++i) {
+    y[i] = 1.f / (1.f + y[i]);
+  }
+#else
+  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
+template <>
+inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
+                                                    float* y) {
+  vec_sigmoid<float, platform::jit::avx>(n, x, y);
+}
+
+template <>
+inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n,
+                                                             const float* x,
+                                                             float* y) {
+#ifdef __AVX512F__
+  constexpr int block = AVX512_FLOAT_BLOCK;
+  if (n < block) {
+    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m512 max = _mm512_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m512 min = _mm512_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m512 zeros = _mm512_setzero_ps();
+  __m512 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm512_loadu_ps(x + i);    \
+  tmp = _mm512_max_ps(tmp, min);   \
+  tmp = _mm512_min_ps(tmp, max);   \
+  tmp = _mm512_sub_ps(zeros, tmp); \
+  _mm512_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest != 0) {
+    i = n - block;
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+
+  vec_exp<float>(n, y, y);
+
+  __m512 ones = _mm512_set1_ps(1.0f);
+#define MOVE_ONE_STEP             \
+  tmp = _mm512_loadu_ps(y + i);   \
+  tmp = _mm512_add_ps(ones, tmp); \
+  tmp = _mm512_div_ps(ones, tmp); \
+  _mm512_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  for (i = n - rest; i < n; ++i) {
+    y[i] = 1.f / (1.f + y[i]);
+  }
+#else
+  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 0888e44fa655f..8b0e9c086a570 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -104,7 +104,7 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
 TEST(CpuVecTest, sigmoid) {
   namespace jit = paddle::platform::jit;
   using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
     TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
     TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
@@ -117,7 +117,7 @@ TEST(CpuVecTest, sigmoid) {
 TEST(CpuVecTest, tanh) {
   namespace jit = paddle::platform::jit;
   using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
@@ -130,7 +130,7 @@ TEST(CpuVecTest, tanh) {
 TEST(CpuVecTest, relu) {
   namespace jit = paddle::platform::jit;
   using namespace paddle::operators::math;  // NOLINT
-  for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) {
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);

From c70a3fec3e3b469e381279917deb79b786e6b821 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 11:51:50 +0800
Subject: [PATCH 40/62] fix redefinition of argument machine

---
 cmake/configure.cmake | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 7e5d8a76217f1..e03e15bfc017c 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -50,20 +50,14 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(NOT CMAKE_CROSSCOMPILING)
-    set(SIMD_FLAG)
-    if(WITH_AVX)
-        if (AVX512F_FOUND)
-            set(SIMD_FLAG "${SIMD_FLAG} ${AVX512F_FLAG}")
-        endif()
-        if (AVX2_FOUND)
-            set(SIMD_FLAG "${SIMD_FLAG} ${AVX2_FLAG}")
-        endif()
-        if (AVX_FOUND)
-            set(SIMD_FLAG "${SIMD_FLAG} ${AVX_FLAG}")
-        endif()
-        if (SSE3_FOUND)
-            set(SIMD_FLAG "${SIMD_FLAG} ${SSE3_FLAG}")
-        endif()
+    if(WITH_AVX AND AVX512F_FOUND)
+        set(SIMD_FLAG ${AVX512F_FLAG})
+    elseif(WITH_AVX AND AVX2_FOUND)
+        set(SIMD_FLAG ${AVX2_FLAG})
+    elseif(WITH_AVX AND AVX_FOUND)
+        set(SIMD_FLAG ${AVX_FLAG})
+    elseif(SSE3_FOUND)
+        set(SIMD_FLAG ${SSE3_FLAG})
     endif()
 endif()
 

From fff6f595ff502d71c29dd1b5824f1d2940cd6069 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 24 Aug 2018 12:31:29 +0800
Subject: [PATCH 41/62] add unit test

---
 .../paddle/fluid/tests/unittests/test_fill_constant_op.py   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index b73711b19d613..537cabd5d09b7 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -62,8 +62,10 @@ def check_with_place(self, place):
         fill_constant_op.run(scope, place)
 
         # get result from Out
-        result_array = np.array(out)
-        self.assertEqual(result_array, np.full((123, 92), 3.8))
+        result_array = np.array(out.get_tensor())
+        full_array = np.full((123, 92), 3.8, 'float32')
+
+        self.assertTrue(np.array_equal(result_array, full_array))
 
     def test_fill_constant_with_selected_rows(self):
         places = [core.CPUPlace()]

From 66cc1850a8e29858776fe31e4dc00e5dab49f2be Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 24 Aug 2018 12:50:10 +0800
Subject: [PATCH 42/62] add gpu place

---
 python/paddle/fluid/tests/unittests/test_fill_constant_op.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 537cabd5d09b7..fd59c5bb7cff5 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -69,7 +69,9 @@ def check_with_place(self, place):
 
     def test_fill_constant_with_selected_rows(self):
         places = [core.CPUPlace()]
-        # currently only support CPU
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
         for place in places:
             self.check_with_place(place)
 

From 2b4edacca0d8756665dce87402043bb5f7ca26c6 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Fri, 24 Aug 2018 13:14:35 +0800
Subject: [PATCH 43/62] enhance the forward of concat op

---
 paddle/fluid/operators/math/concat.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index fbe7c29783854..c3c5c160db358 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -48,16 +48,16 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
     auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
-    for (int k = 0; k < out_rows; ++k) {
-      T* dst_ptr = output->data<T>() + k * out_cols;
-      int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
-        int col_len = input_cols[j];
-        const T* src_prt = input[j].data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt,
-                     sizeof(T) * col_len);
-        col_idx += col_len;
+    auto output_data = output->data<T>();
+    int col_idx = 0;
+    for (int j = 0; j < num; ++j) {
+      int col_len = input_cols[j];
+      auto input_data = input[j].data<T>();
+      for (int k = 0; k < out_rows; ++k) {
+        memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
+                     input_data + k * col_len, sizeof(T) * col_len);
       }
+      col_idx += col_len;
     }
   }
 };

From f269614bcde3a7526dc164cb5ca9691a605709de Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 12:59:16 +0800
Subject: [PATCH 44/62] further optimize tanh with avx and mkl

---
 paddle/fluid/operators/math/cpu_vec.h | 175 +++++++++++++-------------
 1 file changed, 90 insertions(+), 85 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 52f072eb0e01e..d5f247e7ef6d2 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -45,6 +45,13 @@ inline void vec_exp(const int n, const T* x, T* y) {
   }
 }
 
+template <typename T>
+inline void vec_scal(const int n, const T a, T* x) {
+  for (int i = 0; i < n; ++i) {
+    x[i] = a * x[i];
+  }
+}
+
 #ifdef PADDLE_WITH_MKLML
 template <>
 inline void vec_exp<float>(const int n, const float* x, float* y) {
@@ -55,7 +62,74 @@ template <>
 inline void vec_exp<double>(const int n, const double* x, double* y) {
   platform::dynload::vdExp(n, x, y);
 }
+
+template <>
+inline void vec_scal<float>(const int n, const float a, float* x) {
+  platform::dynload::cblas_sscal(n, a, x, 1);
+}
+
+template <>
+inline void vec_scal<double>(const int n, const double a, double* x) {
+  platform::dynload::cblas_dscal(n, a, x, 1);
+}
+#endif
+
+// MKL scal only support inplace, choose this if src and dst are not equal
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_scal(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
+                                                const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block * 4) {  // use larger threshold, since small ones has no boost
+    vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 scalar = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP               \
+  tmp = _mm256_loadu_ps(x + i);     \
+  tmp = _mm256_mul_ps(tmp, scalar); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+#else
+  vec_scal<float, platform::jit::isa_any>(n, a, x, y);
 #endif
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
+                                                 const float* x, float* y) {
+  vec_scal<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx512_common>(const int n,
+                                                          const float a,
+                                                          const float* x,
+                                                          float* y) {
+  // TODO(TJ): enable me
+  vec_scal<float, platform::jit::avx2>(n, a, x, y);
+}
 
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
@@ -82,7 +156,7 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
                                                    float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block) {  // can use larger threshold if necessary
+  if (n < block) {
     vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
     return;
   }
@@ -102,11 +176,15 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
   for (i = 0; i < end; i += block) {
     MOVE_ONE_STEP;
   }
+#undef MOVE_ONE_STEP
   if (rest != 0) {
-    i = n - block;
-    MOVE_ONE_STEP;
+    // can not continue move step since the src and dst address could be equal
+    const float xmin = SIGMOID_THRESHOLD_MIN;
+    const float xmax = SIGMOID_THRESHOLD_MAX;
+    for (i = n - rest; i < n; ++i) {
+      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
+    }
   }
-#undef MOVE_ONE_STEP
 
   vec_exp<float>(n, y, y);
 
@@ -142,65 +220,17 @@ template <>
 inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n,
                                                              const float* x,
                                                              float* y) {
-#ifdef __AVX512F__
-  constexpr int block = AVX512_FLOAT_BLOCK;
-  if (n < block) {
-    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m512 max = _mm512_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m512 min = _mm512_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m512 zeros = _mm512_setzero_ps();
-  __m512 tmp;
-#define MOVE_ONE_STEP              \
-  tmp = _mm512_loadu_ps(x + i);    \
-  tmp = _mm512_max_ps(tmp, min);   \
-  tmp = _mm512_min_ps(tmp, max);   \
-  tmp = _mm512_sub_ps(zeros, tmp); \
-  _mm512_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-  if (rest != 0) {
-    i = n - block;
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-
-  vec_exp<float>(n, y, y);
-
-  __m512 ones = _mm512_set1_ps(1.0f);
-#define MOVE_ONE_STEP             \
-  tmp = _mm512_loadu_ps(y + i);   \
-  tmp = _mm512_add_ps(ones, tmp); \
-  tmp = _mm512_div_ps(ones, tmp); \
-  _mm512_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-#undef MOVE_ONE_STEP
-  if (rest == 0) {
-    return;
-  }
-  for (i = n - rest; i < n; ++i) {
-    y[i] = 1.f / (1.f + y[i]);
-  }
-#else
-  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
-#endif
+  // TODO(TJ): enable me
+  vec_sigmoid<float, platform::jit::avx2>(n, x, y);
 }
 
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
+  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
+  vec_sigmoid<T, isa>(n, y, y);
+  vec_scal<T>(n, static_cast<T>(2), y);
   for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  vec_sigmoid<T>(n, y, y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
+    y[i] = y[i] - static_cast<T>(1);
   }
 }
 
@@ -255,35 +285,10 @@ template <>
 inline void vec_relu<float, platform::jit::avx512_common>(const int n,
                                                           const float* x,
                                                           float* y) {
-#ifdef __AVX512F__
-  // test me
-  constexpr int block = AVX512_FLOAT_BLOCK;
-  if (n < block) {
-    vec_relu<float, platform::jit::avx2>(n, x, y);
-    return;
-  }
-  const int rest = n % block;
-  const int end = n - rest;
-  int i = 0;
-  __m512 zeros = _mm512_setzero_ps();
-  __m512 tmp;
-#define MOVE_ONE_STEP              \
-  tmp = _mm512_loadu_ps(x + i);    \
-  tmp = _mm512_max_ps(tmp, zeros); \
-  _mm512_storeu_ps(y + i, tmp)
-  for (i = 0; i < end; i += block) {
-    MOVE_ONE_STEP;
-  }
-  if (rest == 0) {
-    return;
-  }
-  i = n - block;
-  MOVE_ONE_STEP;
-#undef MOVE_ONE_STEP
-#else
+  // TODO(TJ): enable me
   vec_relu<float, platform::jit::avx2>(n, x, y);
-#endif
 }
+// TODO(TJ): add vec add bias, make relu clip
 
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
 

From bb9f98e10d0d138119070af17ab74cec7e94244d Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 14:04:49 +0800
Subject: [PATCH 45/62] add inplace test

---
 paddle/fluid/operators/math/cpu_vec_test.cc | 61 +++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 8b0e9c086a570..bf6481c5ccd76 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <sys/time.h>
 #include <cmath>
+#include <cstring>
 #include <vector>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
@@ -139,3 +140,63 @@ TEST(CpuVecTest, relu) {
   }
   TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
+
+template <typename T>
+void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
+                 std::function<void(const int, const T*, T*)> ref) {
+  std::vector<T> x(n);
+  std::vector<T> ytgt(n), yref(n);
+  RandomVec<T>(n, x.data());
+
+  const T* x_data = x.data();
+  T* yref_data = yref.data();
+  T* ytgt_data = ytgt.data();
+  std::memcpy(yref_data, x_data, sizeof(T) * n);
+  std::memcpy(ytgt_data, x_data, sizeof(T) * n);
+
+  ref(n, yref_data, yref_data);
+  tgt(n, ytgt_data, ytgt_data);
+
+  for (int i = 0; i < n; ++i) {
+    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
+  }
+}
+
+TEST(CpuVecTest, inplace_sigmoid) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+                       ref_sigmoid<float>);
+  }
+  TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
+}
+
+TEST(CpuVecTest, inplace_tanh) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx512_common>,
+                       ref_tanh<float>);
+  }
+  TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
+}
+
+TEST(CpuVecTest, inplace_relu) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx512_common>,
+                       ref_relu<float>);
+  }
+  TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
+}

From 786558fc680622844d45ac7ea75d899898f95b3b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 24 Aug 2018 06:40:13 +0000
Subject: [PATCH 46/62] fix bug to avoid warning once import paddle.fluid

---
 python/paddle/dataset/image.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 1cd50bd180209..b32736ee7c265 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -36,11 +36,6 @@
 try:
     import cv2
 except ImportError:
-    import sys
-    sys.stderr.write(
-        '''Warning with paddle image module: opencv-python should be imported,
-    or paddle image module could NOT work; please install opencv-python first.'''
-    )
     cv2 = None
 import os
 import tarfile
@@ -53,6 +48,18 @@
 ]
 
 
+def _check_cv2():
+    if cv2 is None:
+        import sys
+        sys.stderr.write(
+            '''Warning with paddle image module: opencv-python should be imported,
+         or paddle image module could NOT work; please install opencv-python first.'''
+        )
+        return False
+    else:
+        return True
+
+
 def batch_images_from_tar(data_file,
                           dataset_name,
                           img2label,
@@ -134,7 +141,7 @@ def load_image_bytes(bytes, is_color=True):
                      load and return a gray image.
     :type is_color: bool
     """
-    assert cv2 is not None
+    assert _check_cv2() is True
 
     flag = 1 if is_color else 0
     file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
@@ -159,7 +166,7 @@ def load_image(file, is_color=True):
                      load and return a gray image.
     :type is_color: bool
     """
-    assert cv2 is not None
+    assert _check_cv2() is True
 
     # cv2.IMAGE_COLOR for OpenCV3
     # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
@@ -188,7 +195,7 @@ def resize_short(im, size):
     :param size: the shorter edge size of image after resizing.
     :type size: int
     """
-    assert cv2 is not None
+    assert _check_cv2() is True
 
     h, w = im.shape[:2]
     h_new, w_new = size, size

From 3462c29940ccf4e60f56f430757655d9c9676200 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 14:53:45 +0800
Subject: [PATCH 47/62] refine add bias with avx

---
 paddle/fluid/operators/attention_lstm_op.cc | 30 +++-------
 paddle/fluid/operators/math/cpu_vec.h       | 66 +++++++++++++++++++--
 2 files changed, 69 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 1cb65346ee2b7..a73ea09f1e120 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, y, y);
   } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);
   }
 }
 
-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {
   T scalar = x[0];
   // max
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                                    // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
     scalar += y[i];
   }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }
 
 template <typename T>
@@ -363,7 +351,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                        fc_out_data);
         }
         // 1d. softmax
-        vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+        vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
         // mul x(seq_len*M) and sum pool
         math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                           cur_x_data, lstm_x_data);
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index d5f247e7ef6d2..0bae926e98929 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -87,7 +87,7 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
                                                 const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block * 4) {  // use larger threshold, since small ones has no boost
+  if (n < block) {
     vec_scal<float, platform::jit::isa_any>(n, a, x, y);
     return;
   }
@@ -131,6 +131,62 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
   vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }
 
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_add_ps(tmp, bias); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
@@ -229,11 +285,10 @@ inline void vec_tanh(const int n, const T* x, T* y) {
   vec_scal<T, isa>(n, static_cast<T>(2), x, y);
   vec_sigmoid<T, isa>(n, y, y);
   vec_scal<T>(n, static_cast<T>(2), y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = y[i] - static_cast<T>(1);
-  }
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
 }
 
+// TODO(TJ): make relu clip
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
@@ -246,7 +301,7 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                 float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block) {
+  if (n < block * 4) {
     vec_relu<float, platform::jit::isa_any>(n, x, y);
     return;
   }
@@ -288,7 +343,6 @@ inline void vec_relu<float, platform::jit::avx512_common>(const int n,
   // TODO(TJ): enable me
   vec_relu<float, platform::jit::avx2>(n, x, y);
 }
-// TODO(TJ): add vec add bias, make relu clip
 
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
 

From ba943d38e38b96b527114b70a37321af665a5062 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 24 Aug 2018 15:07:05 +0800
Subject: [PATCH 48/62] make runtime avx act

---
 paddle/fluid/operators/attention_lstm_op.cc | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index a73ea09f1e120..8bab37c5830df 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -299,11 +299,21 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
     fc_out->Resize({max_seq_len, 1});
 
-    math::VecActivations<T> act_functor;
     std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    act_gate = act_functor(ctx.Attr<std::string>("gate_activation"));
-    act_cell = act_functor(ctx.Attr<std::string>("cell_activation"));
-    act_cand = act_functor(ctx.Attr<std::string>("candidate_activation"));
+    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
+    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
+    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
+    if (platform::jit::MayIUse(platform::jit::avx)) {
+      math::VecActivations<T, platform::jit::avx> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    } else {
+      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    }
 
     const T* x_data = x->data<T>();
     const T* h0_data = h0 ? h0->data<T>() : NULL;

From 3d06ccfb23f45994253cd229ddeab0e7b36e0a15 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Fri, 24 Aug 2018 17:07:25 +0800
Subject: [PATCH 49/62] update native_infer.rst

---
 doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
index 3571f81326a9f..aa9377c112856 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -9,8 +9,6 @@ Paddle 预测 API
 
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
--  库文件 ``libpaddle_inference_api.so`` 或
-   ``libpaddle_inference_api.a``
 
 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
 
@@ -97,8 +95,7 @@ engine
     CHECK(predictor->Run(slots, &outputs));
     // 获取 outputs ...
 
-编译时，联编 ``libpaddle_fluid.a/.so`` 和
-``libpaddle_inference_api.a/.so`` 便可。
+编译时，联编 ``libpaddle_fluid.a/.so`` 即可。
 
 详细代码参考
 ------------

From 3b38e5a4fc5be2740762d9ff7a8ff8b5b7d5e930 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 24 Aug 2018 10:04:18 +0000
Subject: [PATCH 50/62] speed up stack_op

---
 paddle/fluid/operators/stack_op.h | 56 ++++++++-----------------------
 1 file changed, 14 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
index c777d5feaec1c..d236c5b943704 100644
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -150,30 +150,17 @@ class StackKernel : public framework::OpKernel<T> {
     int total_num = pre * n * post;
 
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    constexpr auto kMaxThreshold = 16;
-    if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
-        n > kMaxThreshold) {
 #ifdef __NVCC__
-      VLOG(10) << "Stack more than " << kMaxThreshold
-               << " tensors on GPU may be slow.";
-      thrust::device_vector<const T *> device_x_vec(x_datas);
-      auto x_data_arr = device_x_vec.data().get();
+    thrust::device_vector<const T *> device_x_vec(x_datas);
+    auto x_data_arr = device_x_vec.data().get();
 #else
-      auto x_data_arr = x_datas.data();
+    auto x_data_arr = x_datas.data();
 #endif
-      StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
+    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
 #ifdef __NVCC__
-      // Wait() must be called because device_x_vec may be destructed before
-      // kernel ends
-      dev_ctx.Wait();
-#endif
-    }
-#ifdef __NVCC__
-    else {  // NOLINT
-      framework::Array<const T *, kMaxThreshold> x_data_arr;
-      for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i];
-      StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
-    }
+    // Wait() must be called because device_x_vec may be destructed before
+    // kernel ends
+    dev_ctx.Wait();
 #endif
   }
 };
@@ -244,32 +231,17 @@ class StackGradKernel : public framework::OpKernel<T> {
     int post = total_num / (n * pre);
 
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    constexpr auto kMaxThreshold = 16;
-    if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
-        n > kMaxThreshold) {
 #ifdef __NVCC__
-      VLOG(10) << "Stack more than " << kMaxThreshold
-               << " tensors on GPU may be slow.";
-      thrust::device_vector<T *> device_dx_vec(dx_datas);
-      auto dx_data_arr = device_dx_vec.data().get();
+    thrust::device_vector<T *> device_dx_vec(dx_datas);
+    auto dx_data_arr = device_dx_vec.data().get();
 #else
-      auto dx_data_arr = dx_datas.data();
+    auto dx_data_arr = dx_datas.data();
 #endif
-      StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
-                               post);
+    StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
 #ifdef __NVCC__
-      // Wait() must be called because device_dx_vec may be destructed before
-      // kernel ends
-      dev_ctx.Wait();
-#endif
-    }
-#ifdef __NVCC__
-    else {  // NOLINT
-      framework::Array<T *, kMaxThreshold> dx_data_arr;
-      for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i];
-      StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
-                               post);
-    }
+    // Wait() must be called because device_dx_vec may be destructed before
+    // kernel ends
+    dev_ctx.Wait();
 #endif
   }
 };

From 593ac0f23ec6c9beac85791a37d29228414903db Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 11:05:39 +0800
Subject: [PATCH 51/62] openblas (#12937)

---
 cmake/external/glog.cmake     |  7 +++++++
 cmake/external/openblas.cmake | 19 +++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index ac0181e69cbf5..25ef2970ac52f 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -60,6 +60,13 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+  IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
+    add_custom_command(TARGET extern_glog POST_BUILD
+    COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
+  )
+  ENDIF()
+ENDIF(WIN32)
 
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 56024edf5be09..c3fbe4dbdb28f 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -17,20 +17,29 @@ IF(USE_EIGEN_FOR_BLAS)
 ENDIF(USE_EIGEN_FOR_BLAS)
 
 INCLUDE(cblas)
+# IF(WIN32 AND NOT ${CBLAS_FOUND})
+
+
 
 IF(NOT ${CBLAS_FOUND})
+
     INCLUDE(ExternalProject)
 
     SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
 
     SET(CBLAS_LIBRARIES
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
     ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+    IF (WIN32)
+        SET(CBLAS_FOUND true)
+        MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
+    ENDIF(WIN32)
 
+    IF (NOT WIN32)
     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
     SET(OPENBLAS_COMMIT "v0.2.20")
 
@@ -69,7 +78,6 @@ IF(NOT ${CBLAS_FOUND})
     ENDIF()
 
     SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-
     ExternalProject_Add(
         extern_openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -84,9 +92,11 @@ IF(NOT ${CBLAS_FOUND})
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
+    ELSE()
+    ENDIF(NOT WIN32)
     SET(CBLAS_PROVIDER openblas)
     IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
         # Because libopenblas.a is a symbolic link of another library, thus need to
         # install the whole directory.
         IF(ANDROID)
@@ -107,7 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})
 
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
 
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

From 5df65811010162743959090d7a80e557d9594178 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 11:05:49 +0800
Subject: [PATCH 52/62] merge_static_libs (#12936)

---
 cmake/generic.cmake       | 38 ++++++++++++++++++++++++++++++++++++--
 cmake/inference_lib.cmake |  9 +++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 82c958073cba9..6d230942321f8 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -148,7 +148,8 @@ function(merge_static_libs TARGET_NAME)
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
       )
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  endif(APPLE)
+  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
     set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
 
     foreach(lib ${libs})
@@ -187,7 +188,36 @@ function(merge_static_libs TARGET_NAME)
         COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
         COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
         WORKING_DIRECTORY ${target_DIR})
-  endif()
+  endif(LINUX)
+  if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs})
+
+    # Generate dummy staic lib
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      #if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib")
+      #  message("library" ${lib})
+      #  set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>)
+      #else()
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+      #endif()
+    endforeach()
+   
+    # windows cmd return error in clean env.
+    # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
+      )
+  endif(WIN32)
 endfunction(merge_static_libs)
 
 function(cc_library TARGET_NAME)
@@ -195,6 +225,10 @@ function(cc_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(WIN32)
+      # add libxxx.lib prefix in windows
+      set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+  endif(WIN32)
   if(cc_library_SRCS)
     if(cc_library_SHARED OR cc_library_shared) # build *.so
       add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 834ab5a9e5273..bc36683a9facc 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -101,6 +101,7 @@ if(WITH_MKLDNN)
   )
 endif()
 
+if (NOT WIN32)
 if(NOT MOBILE_INFERENCE AND NOT RPI)
   set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
   copy(snappy_lib
@@ -120,15 +121,23 @@ if(NOT MOBILE_INFERENCE AND NOT RPI)
     DSTS ${dst_dir} ${dst_dir}/lib
     DEPS zlib)
 endif()
+endif(NOT WIN32)
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
+if (NOT WIN32)
 copy(framework_lib DEPS framework_py_proto 
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
+else()
+copy(framework_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+endif(NOT WIN32)
 
 set(module "memory")
 copy(memory_lib

From 669304f4e5005c9dd4763a86a2f91773d68941be Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 11:06:00 +0800
Subject: [PATCH 53/62] protobuf (#12935)

---
 cmake/external/protobuf.cmake | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2665996432b1f..550b0dada8e90 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,14 @@
 
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+IF(NOT WIN32)
 FIND_PACKAGE(Protobuf QUIET)
+ENDIF(NOT WIN32)
 macro(UNSET_VAR VAR_NAME)
     UNSET(${VAR_NAME} CACHE)
     UNSET(${VAR_NAME})
 endmacro()
+
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(PROTOBUF_FOUND)
 UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
     SET(protobuf_DEPS ${ARGN})
 
     MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
     MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
     MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
     INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
 
     # Assuming that all the protobuf libraries are of the same type.
-    IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+    IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
         SET(protobuf_LIBTYPE STATIC)
     ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
         SET(protobuf_LIBTYPE SHARED)
@@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION)
 endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+IF (WIN32)
+    SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
+    MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
+ENDIF(WIN32)
+
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
+
     find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
     find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+        SET(PROTOBUF_FOUND true)
         SET_PROTOBUF_VERSION()
         PROMPT_PROTOBUF_LIB()
     else()
-        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
     endif()
 endif()
 
@@ -239,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING)
         CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF()
 
+
 IF(NOT PROTOBUF_FOUND)
     build_protobuf(extern_protobuf FALSE)
 

From a4ffdf3088daaef939eab72db0c96473db8e2621 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 11:25:45 +0800
Subject: [PATCH 54/62] gflags (#12928)

---
 cmake/external/gflags.cmake | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index a1d2d0f44685c..cf58cc3976235 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
 SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
   set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ENDIF(WIN32)
@@ -45,7 +45,13 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-
+IF(WIN32)
+  IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
+    add_custom_command(TARGET extern_gflags POST_BUILD
+    COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
+  )
+  ENDIF()
+ENDIF(WIN32)
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
@@ -60,3 +66,4 @@ IF(WITH_C_API)
     INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
   ENDIF()
 ENDIF()
+

From eca4563e5dfd949d4ee8c945494a5f25412dae17 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 11:37:46 +0800
Subject: [PATCH 55/62] operators module (#12938)

---
 paddle/fluid/operators/CMakeLists.txt       |  5 +++--
 paddle/fluid/operators/math/math_function.h |  4 ++++
 paddle/fluid/platform/float16.h             |  4 ++++
 paddle/fluid/pybind/CMakeLists.txt          | 12 +++++++-----
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 68fbde2c09fd9..8da0aaaafeb15 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -85,7 +85,7 @@ function(op_library TARGET)
 
     #remove windows unsupported op
     if (WIN32)
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
@@ -319,8 +319,9 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-
+if (NOT WIN32)
 add_subdirectory(reader)
+endif(NOT WIN32)
 foreach(src ${READER_LIBRARY})
     set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 7ec78d9ef8e7f..c63ad89e46d2c 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -19,6 +19,10 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+// remove typedef in openblas
+#undef FLOAT
+#undef INT
+#undef SIZE
 #endif
 
 #include <cmath>
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index efb021c838e36..ee16fc66e4aa7 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -56,7 +56,11 @@ limitations under the License. */
 #include <immintrin.h>
 #endif  // PADDLE_ARM
 
+#if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+#else
+#define PADDLE_ALIGN(x) /*do nothing*/
+#endif
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index d6a14b3305c5c..b5bd07d401f9e 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,17 +1,19 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-          )
+
+set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
 if(NOT WIN32)
-list(APPEND PYBIND_DEPS parallel_executor)
+list(APPEND PYBIND_DEPS parallel_executor profiler)
+list(APPEND PYBIND_SRCS recordio.cc)
 endif()
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
-      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      SRCS ${PYBIND_SRCS}
       DEPS ${PYBIND_DEPS}
       ${GLOB_OP_LIB})
   else()
     cc_library(paddle_pybind SHARED
-      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      SRCS ${PYBIND_SRCS}
       DEPS ${PYBIND_DEPS}
       ${GLOB_OP_LIB})
     if(NOT APPLE AND NOT ANDROID AND NOT WIN32)

From d0b713493eebd79c8bc6c40a8d55f6f31bad4021 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sat, 25 Aug 2018 13:30:22 +0800
Subject: [PATCH 56/62] enhance DebugStringEx (#12949)

---
 paddle/fluid/framework/operator.cc | 52 +++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d04f7744961b2..d58d6e4f3e684 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -74,6 +74,12 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }
 
+static bool VarInited(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) return false;
+  return var->IsInitialized();
+}
+
 static std::string GetDtype(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -87,8 +93,12 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
     }
     return DataTypeToString(ToDataType(tensor.type()));
   } else if (var->IsType<SelectedRows>()) {
-    return DataTypeToString(
-        ToDataType(var->Get<SelectedRows>().value().type()));
+    auto tensor = var->Get<SelectedRows>().value();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return "uninited";
+    } else {
+      return DataTypeToString(ToDataType(tensor.type()));
+    }
   } else {
     return "";
   }
@@ -197,16 +207,21 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     auto& input = *it;
     ss << input.first << "[";
     for (size_t i = 0; i < input.second.size(); ++i) {
-      ss << input.second[i];
+      auto var_name = input.second[i];
+      ss << var_name;
       if (scope) {
-        int row_size = GetRowSize(*scope, input.second[i]);
-        if (row_size >= 0) {
-          ss << "[row_size=" << row_size << "]";
+        if (!VarInited(*scope, var_name)) {
+          ss << "[uninited]";
+        } else {
+          int row_size = GetRowSize(*scope, var_name);
+          if (row_size >= 0) {
+            ss << "[row_size=" << row_size << "]";
+          }
+          std::string dtype = GetDtype(*scope, var_name);
+          ss << ":" << dtype;
+          ss << "[" << GetDims(*scope, var_name, true) << "]";
+          ss << "(" << GetLoD(*scope, var_name) << ")";
         }
-        std::string dtype = GetDtype(*scope, input.second[i]);
-        ss << ":" << dtype;
-        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
-        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
         ss << ", ";
@@ -223,14 +238,19 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     auto& output = *it;
     ss << output.first << "[";
     for (size_t i = 0; i < output.second.size(); ++i) {
-      ss << output.second[i];
+      auto var_name = output.second[i];
+      ss << var_name;
       if (scope) {
-        int row_size = GetRowSize(*scope, output.second[i]);
-        if (row_size >= 0) {
-          ss << "[row_size=" << row_size << "]";
+        if (!VarInited(*scope, var_name)) {
+          ss << "[uninited]";
+        } else {
+          int row_size = GetRowSize(*scope, output.second[i]);
+          if (row_size >= 0) {
+            ss << "[row_size=" << row_size << "]";
+          }
+          ss << "[" << GetDims(*scope, var_name, true) << "]";
+          ss << "(" << GetLoD(*scope, var_name) << ")";
         }
-        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
-        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
         ss << ", ";

From c790d57cd4ac80610f7e0f3c4ab164e57f74e463 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 14:53:27 +0800
Subject: [PATCH 57/62] data_type (#12933)

* data_type

* "remove tabs"
---
 paddle/fluid/CMakeLists.txt           |  6 ++++-
 paddle/fluid/framework/CMakeLists.txt | 21 ++++++++++++++--
 paddle/fluid/framework/data_type.h    | 35 +++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 2577e59d9cf24..ee1f655e25ded 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -2,9 +2,13 @@ add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
-add_subdirectory(pybind)
 add_subdirectory(string)
+
+if (NOT WIN32)
+add_subdirectory(pybind)
 add_subdirectory(recordio)
+endif(NOT WIN32)
+
 if(WITH_INFERENCE)
   # NOTE: please add subdirectory inference at last.
   add_subdirectory(inference)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 2ec422cc17faf..2c62d4ed6b0e6 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,5 +1,7 @@
-add_subdirectory(details)
 add_subdirectory(ir)
+if (NOT WIN32)
+add_subdirectory(details)
+endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -28,8 +30,12 @@ if(WITH_GPU)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-
+if (NOT WIN32)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
+else()
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
+endif (NOT WIN32)
+
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
@@ -69,14 +75,22 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
+
+if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler)
+else()
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor)
+endif(NOT WIN32)
+
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
+if (NOT WIN32)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -86,6 +100,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
     COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif(NOT WIN32)
 
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
@@ -120,7 +135,9 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
+if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
+endif (NOT WIN32)
 
 # disable test temporarily.
 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 491413db8c8d6..f8c72ffc8964e 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -26,6 +26,7 @@ namespace framework {
 extern proto::VarType::Type ToDataType(std::type_index type);
 extern std::type_index ToTypeIndex(proto::VarType::Type type);
 
+#if !defined(_WIN32)
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   switch (type) {
@@ -57,6 +58,40 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
       PADDLE_THROW("Not supported %d", type);
   }
 }
+#else
+// the msvc compiler do not implement two-stage name lookup correctly.
+template <typename Visitor>
+inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
+  switch (type) {
+    case proto::VarType::FP16:
+      visitor.operator()<platform::float16>();
+      break;
+    case proto::VarType::FP32:
+      visitor.operator()<float>();
+      break;
+    case proto::VarType::FP64:
+      visitor.operator()<double>();
+      break;
+    case proto::VarType::INT32:
+      visitor.operator()<int>();
+      break;
+    case proto::VarType::INT64:
+      visitor.operator()<int64_t>();
+      break;
+    case proto::VarType::BOOL:
+      visitor.operator()<bool>();
+      break;
+    case proto::VarType::UINT8:
+      visitor.operator()<uint8_t>();
+      break;
+    case proto::VarType::INT16:
+      visitor.operator()<int16_t>();
+      break;
+    default:
+      PADDLE_THROW("Not supported %d", type);
+  }
+}
+#endif  // _WIN32
 
 extern std::string DataTypeToString(const proto::VarType::Type type);
 extern size_t SizeOfType(std::type_index type);

From 77c0aeb91e8906f6f1cecc1cdb28f1731e4a46c0 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 14:55:56 +0800
Subject: [PATCH 58/62] boost (#12929)

* "fix ci"

* "windows tab"

* "fix ci"
---
 cmake/external/boost.cmake | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 73713d93d5a52..ada61de8eb15a 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -28,7 +28,12 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
     set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
     set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 endif()
-MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+IF (WIN32)
+    MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost)
+else()
+    MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+ENDIF(WIN32)
+
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
@@ -36,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 
 include_directories(${BOOST_INCLUDE_DIR})
 
+if (NOT WIN32)
 ExternalProject_Add(
     ${BOOST_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
     DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
-                          && tar zxf ${BOOST_TAR}.tar.gz
+    && tar zxf ${BOOST_TAR}.tar.gz
     DOWNLOAD_NO_PROGRESS  1
     PREFIX                ${BOOST_SOURCES_DIR}
     CONFIGURE_COMMAND     ""
@@ -49,8 +55,9 @@ ExternalProject_Add(
     INSTALL_COMMAND       ""
     UPDATE_COMMAND        ""
 )
+endif(NOT WIN32)
 
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
     file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
     add_library(boost STATIC ${dummyfile})

From 04b1e4dcea1cb2a590643c464c70167b87ed94d4 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 15:38:24 +0800
Subject: [PATCH 59/62] tensor module windows support (#12934)

* tensor windows support

* "fix ci"

* "remove utils"
---
 paddle/fluid/framework/lod_tensor.cc      | 17 ++++++++++++++++-
 paddle/fluid/framework/lod_tensor_test.cc |  2 ++
 paddle/fluid/framework/rw_lock.h          | 12 ++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 919029c38f2f2..adeb26e4e7869 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -25,8 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 
+#if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
+#endif  // _WIN32
 
 namespace paddle {
 namespace framework {
@@ -300,6 +302,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
+#if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
@@ -329,7 +332,19 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,
 
   return true;
 }
-
+#else
+class Writer {};
+class Scanner {};
+void WriteToRecordIO(recordio::Writer *writer,
+                     const std::vector<LoDTensor> &tensor,
+                     const platform::DeviceContext &dev_ctx) {}
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  PADDLE_ENFORCE("windows didn't supported recordio!.");
+  return true;
+}
+#endif  // _WIN32
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index cd50aaa26054b..cbf5fd04d7300 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -274,6 +274,7 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
   EXPECT_EQ(offset_lod, expected);
 }
 
+#if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
   LoDTensor tensor;
@@ -320,6 +321,7 @@ TEST(LoDTensor, RecordIO) {
   TestRecordIO<float>();
   TestRecordIO<double>();
 }
+#endif  // !defined(_WIN32)
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index 1418fb5134fdd..a068d3543d9d2 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -14,13 +14,16 @@ limitations under the License. */
 
 #pragma once
 
+#if !defined(_WIN32)
 #include <pthread.h>
+#endif  // !_WIN32
 
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
+#if !defined(_WIN32)
 struct RWLock {
   RWLock() { pthread_rwlock_init(&lock_, nullptr); }
 
@@ -43,6 +46,15 @@ struct RWLock {
  private:
   pthread_rwlock_t lock_;
 };
+#else
+// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
+// In windows, rw_lock seems like a hack. Use empty object and do nothing.
+struct RWLock {
+  void RDLock() {}
+  void WRLock() {}
+  void UNLock() {}
+};
+#endif
 
 }  // namespace framework
 }  // namespace paddle

From dbd7896678ade5a57705477d0f963525e909733c Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sat, 25 Aug 2018 15:38:42 +0800
Subject: [PATCH 60/62] cmakelist windows (#12927)

* picked pr

* "fix ci"
---
 CMakeLists.txt        | 17 ++++++++++++-----
 cmake/configure.cmake |  5 +++++
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 317f7f9eb46a9..b1d0abdf2ceb4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,9 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
         "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+if(WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+endif(WIN32)
 
 if(NOT CMAKE_CROSSCOMPILING)
     find_package(CUDA QUIET)
@@ -165,7 +168,6 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
-include(external/warpctc)   # download, build, install warpctc
 include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
@@ -173,6 +175,14 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 
+if (NOT WIN32)
+# there is no official support of snappystream, warpctc, nccl, cupti in windows
+include(external/snappy)    # download snappy
+include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc
+include(cupti)
+endif (NOT WIN32)
+
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
         include(external/grpc)
@@ -194,13 +204,10 @@ if(WITH_BRPC_RDMA)
     endif()
 endif()
 
-include(external/snappy)    # download snappy
-include(external/snappystream)
-include(external/threadpool)
 
+include(external/threadpool)
 include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries, must before configure
-include(cupti)
 include(configure)          # add paddle env configuration
 
 if(WITH_GPU)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e03e15bfc017c..ce1857582bd3e 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -61,6 +61,11 @@ if(NOT CMAKE_CROSSCOMPILING)
     endif()
 endif()
 
+if(WIN32)
+  # windows stupid compile option for all targets.
+  add_definitions(-D_XKEYCHECK_H)
+endif(WIN32)
+
 if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)

From 3c58b87b45440cf13be778a53c6b2744c1d00e7e Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 26 Aug 2018 10:00:41 +0800
Subject: [PATCH 61/62] fix auc layer and add check for auc op (#12954)

* fix auc layer and add check for auc op

* use input to check if states are inited

* optimize code
---
 paddle/fluid/operators/auc_op.h             | 14 ++++++++++++++
 paddle/fluid/operators/math/cpu_vec_test.cc |  1 +
 python/paddle/fluid/layers/metric_op.py     | 12 ++++++++----
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index 0a18585edb54a..0651203286c0f 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -60,6 +60,20 @@ class AucKernel : public framework::OpKernel<T> {
     const T* inference_data = predict->data<T>();
     const auto* label_data = label->data<int64_t>();
 
+    // check if states are inited.
+    auto* tp_in = ctx.Input<Tensor>("TP");
+    auto* fp_in = ctx.Input<Tensor>("FP");
+    auto* tn_in = ctx.Input<Tensor>("TN");
+    auto* fn_in = ctx.Input<Tensor>("FN");
+    PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
+    PADDLE_ENFORCE(fp_in->IsInitialized(), "false_negative is not inited!");
+    PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
+    PADDLE_ENFORCE(fn_in->IsInitialized(), "false_positive is not inited!");
+    PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");
+
     auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
     auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
     auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index bf6481c5ccd76..3ce66f49ed835 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <sys/time.h>
 #include <cmath>
 #include <cstring>
+#include <random>
 #include <vector>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 2c3bdd77e1fa1..0182bbeb637ec 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -119,10 +119,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     helper = LayerHelper("auc", **locals())
     auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(persistable=True, dtype='int64')
-    tn = helper.create_global_variable(persistable=True, dtype='int64')
-    fp = helper.create_global_variable(persistable=True, dtype='int64')
-    fn = helper.create_global_variable(persistable=True, dtype='int64')
+    tp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    tn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
     for var in [tp, tn, fp, fn]:
         helper.set_variable_initializer(
             var, Constant(

From 4fcc2936174315969275abb2a0172c72e3b01bbe Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Sun, 26 Aug 2018 16:25:15 +0800
Subject: [PATCH 62/62] memory module (#12931)

* memory module

* "fix ci"
---
 .../inference/api/demo_ci/CMakeLists.txt      |  2 +
 .../fluid/memory/detail/system_allocator.cc   | 49 +++++++++++++------
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index ba73a6eaa6fc8..a697218377e1e 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -23,9 +23,11 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+endif(NOT WIN32)
 
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 9b1ab1e228dd7..1b96798d23cec 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define GLOG_NO_ABBREVIATED_SEVERITIES
 
 #include "paddle/fluid/memory/detail/system_allocator.h"
 
-#include <stdlib.h>    // for malloc and free
+#ifdef _WIN32
+#include <malloc.h>
+#include <windows.h>  // VirtualLock/VirtualUnlock
+#else
 #include <sys/mman.h>  // for mlock and munlock
-#include <algorithm>   // for std::max
+#endif
+#include <stdlib.h>   // for malloc and free
+#include <algorithm>  // for std::max
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/assert.h"
@@ -35,31 +41,42 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t* index, size_t size) {
-  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
-  // malloc might not return nullptr if size is zero, but the returned
-  // pointer shall not be dereferenced -- so we make it nullptr.
-  if (size <= 0) return nullptr;
-
-  *index = 0;  // unlock memory
-
+void* AlignedMalloc(size_t size) {
   void* p = nullptr;
-
+  size_t alignment = 32ul;
 #ifdef PADDLE_WITH_MKLDNN
   // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
   // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
-                    size);
+  alignment = 4096ul;
+#endif
+#ifdef _WIN32
+  p = _aligned_malloc(size, alignment);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
                     size);
 #endif
   PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
+  return p;
+}
+
+void* CPUAllocator::Alloc(size_t* index, size_t size) {
+  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
+  // malloc might not return nullptr if size is zero, but the returned
+  // pointer shall not be dereferenced -- so we make it nullptr.
+  if (size <= 0) return nullptr;
+
+  *index = 0;  // unlock memory
+
+  void* p = AlignedMalloc(size);
 
   if (p != nullptr) {
     if (FLAGS_use_pinned_memory) {
       *index = 1;
+#ifdef _WIN32
+      VirtualLock(p, size);
+#else
       mlock(p, size);  // lock memory
+#endif
     }
   }
 
@@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
 
 void CPUAllocator::Free(void* p, size_t size, size_t index) {
   if (p != nullptr && index == 1) {
+#ifdef _WIN32
+    VirtualUnlock(p, size);
+#else
     munlock(p, size);
+#endif
   }
   free(p);
 }