Check CUDA kernel launches (/fbcode/caffe2/) (pytorch#49145)
Summary:
Pull Request resolved: pytorch#49145

Pull Request resolved: pytorch#49105

(1) Add a safety check, `C10_CUDA_KERNEL_LAUNCH_CHECK()`, after each kernel launch (see the sketch of the pattern below). This diff only changes files inside the directories /fbsource/fbcode/caffe2/modules/, /fbsource/fbcode/caffe2/fb/, and /fbsource/fbcode/caffe2/test/.

(2) Remove the old `AT_CUDA_CHECK(cudaGetLastError())` check where the new macro makes it redundant.
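
For illustration, here is a minimal sketch of the pattern applied throughout this diff. The kernel, function, and launch parameters are invented for the example and do not come from any of the changed files; only the include and the `C10_CUDA_KERNEL_LAUNCH_CHECK()` call mirror the actual change:
```
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>  // provides C10_CUDA_KERNEL_LAUNCH_CHECK()

// Hypothetical kernel, used only to illustrate the launch-check pattern.
__global__ void scale_kernel(const float* in, float* out, int n, float alpha) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = alpha * in[i];
  }
}

void scale_cuda(const float* in, float* out, int n, float alpha,
                cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  scale_kernel<<<blocks, threads, 0, stream>>>(in, out, n, alpha);
  // New pattern: check right after the launch so configuration errors
  // (e.g. an invalid grid size) surface here instead of at some later,
  // unrelated CUDA call.
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Old pattern removed by (2) where it became redundant:
  //   AT_CUDA_CHECK(cudaGetLastError());
}
```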

Test Plan:
Test build:
```
buck build mode/dev-nosan //caffe2/modules/detectron:
buck test mode/dev-nosan //caffe2/modules/detectron:
buck build mode/dev-nosan //caffe2/torch/fb/:
buck test mode/dev-nosan //caffe2/torch/fb/:
```

To check for kernel launches that are missing a check, run:
```
python3 caffe2/torch/testing/check_kernel_launches.py
```
Make sure none of the updated files are in the returned list.

Reviewed By: r-barnes

Differential Revision: D25452852

fbshipit-source-id: d6657edab612c9e0fa99b29c68460be8b1a20064
Yixin Bao authored and facebook-github-bot committed Dec 10, 2020
1 parent 524adfb commit 840e71f
Showing 14 changed files with 36 additions and 0 deletions.
3 changes: 3 additions & 0 deletions modules/detectron/group_spatial_softmax_op.cu
@@ -112,6 +112,7 @@ bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, A, W, H, Xdata, Pdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}

@@ -158,11 +159,13 @@ bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
SumProbsKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Step 2: dX[i] = dX[i] - s
SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Step 3: dX[i] = Y[i] * dX[i]
math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
2 changes: 2 additions & 0 deletions modules/detectron/ps_roi_pool_op.cu
@@ -253,6 +253,7 @@ bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}

@@ -276,6 +277,7 @@ bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
output_dim_, dX->mutable_data<float>(), R.data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}

2 changes: 2 additions & 0 deletions modules/detectron/roi_pool_f_op.cu
@@ -149,6 +149,7 @@ bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
Y->mutable_data<float>(), A->mutable_data<int>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}

@@ -173,6 +174,7 @@ bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
dX->mutable_data<float>(), R.data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
return true;
}
2 changes: 2 additions & 0 deletions modules/detectron/select_smooth_l1_loss_op.cu
@@ -129,6 +129,7 @@ bool SelectSmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
M, Y_hat.data<float>(), Y.data<float>(),
L.data<float>(), buff_.mutable_data<float>(),
S.data<float>(), beta_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Sum of all losses
// al := sum_i l_i
@@ -175,6 +176,7 @@ bool SelectSmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
D, H, W, M, Y_hat.data<float>(), Y.data<float>(),
L.data<float>(), d_Y_hat->mutable_data<float>(),
d_avg_loss.data<float>(), scale_, S.data<float>(), beta_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
5 changes: 5 additions & 0 deletions modules/detectron/sigmoid_cross_entropy_loss_op.cu
@@ -93,6 +93,8 @@ bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
T.data<int>(),
losses_.mutable_data<float>(),
counts_.mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
float* avg_loss_data = avg_loss->mutable_data<float>();
math::Sum<float, CUDAContext>(
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
@@ -106,6 +108,7 @@ bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
C10_CUDA_KERNEL_LAUNCH_CHECK();
math::Div<float, CUDAContext>(
1, avg_loss_data, normalizer_data, avg_loss_data, &context_);
}
@@ -135,6 +138,7 @@ bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
T.data<int>(),
dX->mutable_data<float>(),
counts_.mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
if (normalize_) {
float* normalizer_data = normalizer_.mutable_data<float>();
math::Sum<float, CUDAContext>(
@@ -145,6 +149,7 @@ bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
C10_CUDA_KERNEL_LAUNCH_CHECK();
math::Div<float, CUDAContext>(
1,
d_avg_loss.data<float>(),
2 changes: 2 additions & 0 deletions modules/detectron/sigmoid_focal_loss_op.cu
@@ -134,6 +134,7 @@ bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
N, D, H, W, X.data<float>(), T.data<int>(),
wp.data<float>(), gamma_, alpha_, num_classes_,
losses_.mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();

math::Sum<float, CUDAContext>(
losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
@@ -165,6 +166,7 @@ bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
wp.data<float>(), gamma_, alpha_, num_classes_,
d_avg_loss.data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
math::Scale<float, float, CUDAContext>(
dX->size(),
scale_,
3 changes: 3 additions & 0 deletions modules/detectron/smooth_l1_loss_op.cu
@@ -102,6 +102,7 @@ bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
context_.cuda_stream()>>>(
buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
beta_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Element-wise weighted smooth l1 loss (can be used to specify a per-element
// loss weight)
@@ -164,6 +165,8 @@ bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
context_.cuda_stream()>>>(
buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
d_avg_loss.data<float>(), scale_ / N, beta_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Element-wise scale by alpha_in and alpha_out
math::Mul<float, CUDAContext>(
d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
5 changes: 5 additions & 0 deletions modules/detectron/softmax_focal_loss_op.cu
@@ -176,6 +176,7 @@ bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
<<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Compute loss for each x,y location
const int* Tdata = T.data<int>();
@@ -184,6 +185,7 @@ bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
0, context_.cuda_stream()>>>(
N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
Wdata, gamma_, alpha_, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// sum the losses
float* avg_loss_data = avg_loss->mutable_data<float>();
@@ -227,13 +229,16 @@ bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
0, context_.cuda_stream()>>>(
N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
Wdata, gamma_, alpha_, num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();

// Compute the gradient with the weights
const float* Bdata = buff_.data<float>();
SoftmaxFocalLossGradientKernel
<<<CAFFE_GET_BLOCKS(N * D * H * W), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
dX->mutable_data<float>(), num_classes_);
C10_CUDA_KERNEL_LAUNCH_CHECK();
math::Scale<float, float, CUDAContext>(
dX->size(),
scale_,
2 changes: 2 additions & 0 deletions modules/detectron/spatial_narrow_as_op.cu
@@ -115,6 +115,7 @@ bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
out_width,
A.template data<T>(),
C->template mutable_data<T>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
@@ -152,6 +153,7 @@ bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
out_width,
dC.template data<T>(),
dA->template mutable_data<T>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
3 changes: 3 additions & 0 deletions modules/detectron/upsample_nearest_op.cu
@@ -164,6 +164,8 @@ bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {

upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
input_data, output_data, no_elements, scale_, d1, d2, d3);
C10_CUDA_KERNEL_LAUNCH_CHECK();

return true;
}

@@ -209,6 +211,7 @@ bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
C10_CUDA_KERNEL_LAUNCH_CHECK();

return true;
}
2 changes: 2 additions & 0 deletions test/cpp_extensions/cuda_extension.cu
@@ -6,6 +6,7 @@

#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#include <ATen/ATen.h>

@@ -26,4 +27,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) {
const int threads = 1024;
const int blocks = (size + threads - 1) / threads;
sigmoid_add_kernel<<<blocks, threads>>>(x, y, output, size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
2 changes: 2 additions & 0 deletions test/cpp_extensions/cuda_extension_kernel.cu
@@ -1,5 +1,6 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#include <ATen/ATen.h>

@@ -20,4 +21,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) {
const int threads = 1024;
const int blocks = (size + threads - 1) / threads;
sigmoid_add_kernel<<<blocks, threads>>>(x, y, output, size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
2 changes: 2 additions & 0 deletions test/cpp_extensions/cuda_extension_kernel2.cu
@@ -1,5 +1,6 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#include <ATen/ATen.h>

@@ -20,4 +21,5 @@ void tanh_add_cuda(const float* x, const float* y, float* output, int size) {
const int threads = 1024;
const int blocks = (size + threads - 1) / threads;
tanh_add_kernel<<<blocks, threads>>>(x, y, output, size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
1 change: 1 addition & 0 deletions torch/lib/c10d/test/CUDATest.cu
@@ -17,6 +17,7 @@ __global__ void waitClocks(const uint64_t count) {

void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) {
waitClocks<<<1, 1, 0, stream.stream()>>>(clocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}

int cudaNumDevices() {
