fix AvgPool2d for 2^31-1 sized inputs, and get test_cuda_kernel_loop_overflow_large to working state (pytorch#30771)

Summary:
Pull Request resolved: pytorch#30771

Differential Revision: D18821529

Pulled By: ngimel

fbshipit-source-id: c5cbf56e686a2a3cfc7274dd96db37289dac7588
soumith authored and facebook-github-bot committed Dec 5, 2019
1 parent 1d20c32 commit a939b52
Showing 1 changed file with 15 additions and 17 deletions.
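
Note on the fix: the old launch configuration computed the grid size with cuda::ATenCeilDiv on a signed int count, so for outputs with close to 2^31-1 elements the intermediate (count + num_threads - 1) overflowed INT_MAX. The patch downcasts numel() to int32_t once, does the grid math in uint32_t, where that intermediate still fits, and reuses the result as num_blocks in every kernel launch. A minimal standalone sketch of the arithmetic (ceil_div below is an illustrative stand-in for cuda::ATenCeilDiv, not the ATen code):

#include <cstdint>
#include <limits>

// Illustrative stand-in for cuda::ATenCeilDiv.
template <typename T>
T ceil_div(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  const int64_t numel = std::numeric_limits<int32_t>::max();  // 2^31 - 1 output elements
  const uint32_t num_threads = 1024;

  // Old pattern: ceil_div<int>(count, num_threads) evaluates count + 1023,
  // which exceeds INT_MAX -> signed overflow (undefined behavior), in
  // practice a negative or nonsensical block count.

  // New pattern: downcast once, then compute the grid size in uint32_t,
  // where 2^31 - 1 + 1023 still fits comfortably.
  const int32_t count = static_cast<int32_t>(numel);
  const uint32_t num_blocks = ceil_div<uint32_t>(static_cast<uint32_t>(count), num_threads);
  return num_blocks == 2097152 ? 0 : 1;  // (2^31 - 1 + 1023) / 1024 == 2^21
}
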
32 changes: 15 additions & 17 deletions aten/src/ATen/native/cuda/AveragePool2d.cu
@@ -171,8 +171,9 @@ void avg_pool2d_out_cuda_template(
 
   output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
 
-  const int count = safe_downcast<int, int64_t>(output.numel());
-  const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
+  const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
+  const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
+  const uint32_t num_blocks = cuda::ATenCeilDiv<uint32_t>(count, num_threads);
 
   if (divisor_override.has_value()) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(),
@@ -184,7 +185,7 @@ void avg_pool2d_out_cuda_template(
         scalar_t *input_data = input.data_ptr<scalar_t>();
 
         avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, false, true>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             input_data,
             nbatch,
@@ -209,7 +210,7 @@ void avg_pool2d_out_cuda_template(
         scalar_t *input_data = input.data_ptr<scalar_t>();
 
         avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, true, false>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             input_data,
             nbatch,
@@ -233,7 +234,7 @@ void avg_pool2d_out_cuda_template(
         scalar_t *input_data = input.data_ptr<scalar_t>();
 
         avg_pool2d_out_cuda_frame<scalar_t, accscalar_t, false, false>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             input_data,
             nbatch,
@@ -249,10 +250,8 @@ void avg_pool2d_out_cuda_template(
     }
   }
 
-
-  TORCH_CHECK(cudaGetLastError() == cudaSuccess,
-    "avg_pool2d_out_cuda_frame failed with error code ",
-    cudaGetLastError());
+  THCudaCheck(cudaGetLastError());
 
   if (input.ndimension() == 3) {
     output.resize_({nInputPlane, outputHeight, outputWidth});
@@ -322,8 +321,9 @@ Tensor& avg_pool2d_backward_out_cuda_template(
 
   gradInput.resize_as_(input);
 
-  const int count = safe_downcast<int, int64_t>(input.numel());
-  const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
+  const int32_t count = safe_downcast<int32_t, int64_t>(input.numel());
+  const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
+  const uint32_t num_blocks = cuda::ATenCeilDiv<uint32_t>(count, num_threads);
 
   if (divisor_override.has_value()) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(),
@@ -335,7 +335,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
         scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
 
         avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, false, true>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             gradOutput_data,
             nbatch,
@@ -360,7 +360,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
         scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
 
         avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, true, false>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             gradOutput_data,
             nbatch,
@@ -384,7 +384,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
         scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
 
         avg_pool2d_backward_out_cuda_frame<scalar_t, accscalar_t, false, false>
-          <<<cuda::ATenCeilDiv(count, num_threads), num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+          <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
             count,
             gradOutput_data,
             nbatch,
@@ -400,9 +400,7 @@ Tensor& avg_pool2d_backward_out_cuda_template(
     }
   }
 
-  TORCH_CHECK(cudaGetLastError() == cudaSuccess,
-    "avg_pool2d_backward_out_cuda failed with error code ",
-    cudaGetLastError());
+  THCudaCheck(cudaGetLastError());
 
   return gradInput;
 }
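
Aside on the error-check change: cudaGetLastError() returns the sticky launch error and resets it to cudaSuccess, so the replaced TORCH_CHECK, which called it a second time inside the failure message, would have reported cudaSuccess rather than the actual error. Whether or not that motivated the switch to THCudaCheck, a single-capture version of the same idea, written against the plain CUDA runtime API rather than ATen's macros (helper name and message text are illustrative), looks roughly like this:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Capture the launch status exactly once; a second cudaGetLastError() call
// would already have been reset to cudaSuccess.
inline void check_last_cuda_error(const char* what) {
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    std::abort();
  }
}
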
