cpu cmrnorm

PaddlePaddle · tianbingsz · Dec 20, 2016 · Dec 12, 2016 · Dec 13, 2016 · Dec 13, 2016
commit 529f24c262850974dd8ba4c5b7ad1a4e3e0230fc
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -381,57 +381,45 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
-__global__ void KeCMRNormFillScale(size_t nthreads, const real* in,
+__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
                                    real* scale, size_t channels,
                                    size_t height, size_t width, size_t size,
                                    real alpha) {
-  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
-    // find out the local offset
-    size_t w = index % width;
-    size_t h = (index / width) % height;
-    size_t n = index / width / height;
-    size_t offset = (n * channels * height + h) * width + w;
-    size_t step = height * width;
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < imageSize) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int n = idx / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+
     in += offset;
     scale += offset;
-    size_t head = 0;
-    size_t pre_pad = (size - 1) / 2;
-    size_t post_pad = size - pre_pad - 1;
-    real accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad) {
-      accum_scale += in[head * step] * in[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
-      accum_scale += in[head * step] * in[head * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
-      ++head;
+    const int step = height * width;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    real accum = 0;
+    int index = 0;
+    while (index < channels + post_pad) {
+      if (index < channels) {
+        accum += in[index * step] * in[index * step];
+      }
+      if (index >= size) {
+        accum -= in[(index - size) * step] * in[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        scale[(index - post_pad) * step] = 1. + accum * alpha;
+      }
+      ++index;
     }
   }
 }
 
- __global__ void KeCMRNormOutput(size_t nthreads, const real* in,
-                                 const real* scale, real negative_beta,
-                                 real* out) {
-  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
+__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
+                                const real* scale, real negative_beta,
+                                real* out) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < inputSize) {
     out[index] = in[index] * pow(scale[index], negative_beta);
   }
 }
@@ -440,84 +428,60 @@ void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale,
                         real* out, size_t channels,
                         size_t height, size_t width, size_t sizeX,
                         real alpha, real beta) {
-  size_t threadsNum = frameCnt * height * width;
-  size_t blocksX = (threadsNum + 1024 - 1) / 1024;
-  size_t blocksY = 1;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-
-  KeCMRNormFillScale<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (threadsNum, in, scale, channels, height, width, sizeX, alpha);
-
-  threadsNum = frameCnt * height * width *channels;
-  blocksX = (threadsNum + 1024 -1) / 1024;
-  dim3 threads2(1024, 1);
-  dim3 grid2(blocksX, blocksY);
-  KeCMRNormOutput<<<grid2, threads2, 0, STREAM_DEFAULT>>>
-           (threadsNum, in, scale, beta, out);
+  size_t imageSize = frameCnt * height * width;
+  int blockSize = 1024;
+  int gridSize = (imageSize + 1024 - 1) / 1024;
+  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+      (imageSize, in, scale, channels, height, width, sizeX, alpha);
+
+  size_t inputSize = frameCnt * height * width *channels;
+  blockSize = 1024;
+  gridSize = (inputSize + 1024 - 1) / 1024;
+  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+           (inputSize, in, scale, beta, out);
   CHECK_SYNC("hl_CMRNorm_forward");
 }
 
-__global__ void KeCMRNormDiff(size_t nthreads, const real* bottom_data,
+__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
                               const real* top_data, const real* scale,
                               const real* top_diff, size_t channels,
                               size_t height, size_t width, size_t size,
                               real negative_beta, real cache_ratio,
                               real* bottom_diff ) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < nthreads) {
-    // find out the local offset
-    size_t w = index % width;
-    size_t h = (index / width) % height;
-    size_t n = index / width / height;
-    size_t offset = (n * channels * height + h) * width + w;
-    size_t step = height * width;
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < imageSize) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int n = idx / width / height;
+    const int offset = (n * channels * height + h) * width + w;
     bottom_data += offset;
     top_data += offset;
     scale += offset;
     top_diff += offset;
     bottom_diff += offset;
-    int head = 0;
-    int pre_pad = size - (size + 1) / 2;
-    int post_pad = size - pre_pad - 1;
-    real accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad) {
-      accum_ratio += top_diff[head * step] *
-        top_data[head * step] / scale[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
-      accum_ratio += top_diff[head * step] *
-        top_data[head * step] / scale[head * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] +=
-        top_diff[(head - post_pad) * step] *
-        pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-        bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
+
+    const int step = height * width;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    int index = 0;
+    real accum = 0;
+    while (index < channels + post_pad) {
+      if (index < channels) {
+        accum += top_diff[index * step] * top_data[index * step] /
+          scale[index * step];
+      }
+      if (index >= size) {
+        accum -= top_diff[(index - size) * step] *
+          top_data[(index - size) * step] / scale[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        bottom_diff[(index - post_pad) * step] +=
+          top_diff[(index - post_pad) * step] *
+          pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(index - post_pad) * step] * accum;
+      }
+      ++index;
     }
   }
 }
@@ -528,14 +492,12 @@ void hl_CMRNorm_backward(size_t frameCnt, const real* inV,
                          real *inDiff, size_t channels,
                          size_t height, size_t width, size_t sizeX,
                          real alpha, real beta) {
-  size_t threadsNum = frameCnt * height * width;
-  size_t blocksX = (threadsNum + 1024 - 1) / 1024;
-  size_t blocksY = 1;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-  KeCMRNormDiff <<<grid, threads, 0, STREAM_DEFAULT>>>
-           (threadsNum, inV, outV, scale, outDiff, channels,
-           height, width, sizeX, alpha, beta, inDiff);
+  size_t imageSize = frameCnt * height * width;
+  int blockSize = 1024;
+  int gridSize = (imageSize + 1024 - 1) / 1024;
+  KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+    (imageSize, inV, outV, scale, outDiff, channels,
+      height, width, sizeX, alpha, beta, inDiff);
   CHECK_SYNC("hl_CMRNorm_backward");
 }
 

diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1021,11 +1021,10 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
   testLayerGrad(config, "norm", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
 TEST(Layer, NormLayer) {
   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
+  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
 }
-#endif
 
 void setPoolConfig(TestConfig* config,
                    PoolConfig* pool,