diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7659a869..217e236e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,17 +10,24 @@ set(CUDATESTSRC ) if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI") - list(APPEND TESTSRC ${CUDATESTSRC}) + foreach(testfile IN LISTS CUDATESTSRC) + get_filename_component(test_name ${testfile} NAME_WE) + + add_executable(${test_name} ${testfile}) + target_compile_options(${test_name} PRIVATE "-cuda") + target_compile_options(${test_name} PRIVATE "-O3") + target_compile_options(${test_name} PRIVATE "-fast") + target_link_options(${test_name} INTERFACE "-cuda") + target_link_libraries(${test_name} PRIVATE x3d2) + + add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}") + endforeach() endif() foreach(testfile IN LISTS TESTSRC) get_filename_component(test_name ${testfile} NAME_WE) add_executable(${test_name} ${testfile}) - target_compile_options(${test_name} PRIVATE "-cuda") - target_compile_options(${test_name} PRIVATE "-O3") - target_compile_options(${test_name} PRIVATE "-fast") - target_link_options(${test_name} INTERFACE "-cuda") target_link_libraries(${test_name} PRIVATE x3d2) add_test(NAME ${test_name} COMMAND ${test_name}) diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90 index ce0f9758..116be9ec 100644 --- a/tests/cuda/test_cuda_tridiag.f90 +++ b/tests/cuda/test_cuda_tridiag.f90 @@ -1,6 +1,7 @@ program test_cuda_tridiag use iso_fortran_env, only: stderr => error_unit use cudafor + use mpi use m_common, only: dp, SZ, pi use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs @@ -27,25 +28,45 @@ program test_cuda_tridiag dist_sa_dev, dist_sc_dev integer :: n, n_block, i, j, k, n_halo, n_stencil, n_iters - integer :: ierr, memClockRt, memBusWidth + integer :: n_glob + integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 + integer, allocatable :: srerr(:), mpireq(:) + integer :: ierr, ndevs, devnum, memClockRt, memBusWidth + type(dim3) :: blocks, threads real(dp) :: dx, dx2, alfa, norm_du, tol = 1d-8, tstart, tend real(dp) :: achievedBW, deviceBW - n = 512 + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + ierr = cudaGetDeviceCount(ndevs) + ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin + ierr = cudaGetDevice(devnum) + print*, 'I am rank', nrank, 'I am running on device', devnum + pnext = modulo(nrank-nproc+1, nproc) + pprev = modulo(nrank-1, nproc) + print*, 'rank', nrank, 'pnext', pnext, 'pprev', pprev + allocate(srerr(nproc), mpireq(nproc)) + + n_glob = 512*4 + n = n_glob/nproc n_block = 512*512/SZ n_iters = 1000 allocate(u(SZ, n, n_block), du(SZ, n, n_block)) allocate(u_dev(SZ, n, n_block), du_dev(SZ, n, n_block)) - dx = 2*pi/n + dx = 2*pi/n_glob dx2 = dx*dx do k = 1, n_block do j = 1, n do i = 1, SZ - u(i, j, k) = sin((j-1)*dx) + u(i, j, k) = sin((j-1+nrank*n)*dx) end do end do end do @@ -61,9 +82,10 @@ program test_cuda_tridiag allocate(coeffs_b_dev(n_halo, n_stencil), coeffs_e_dev(n_halo, n_stencil)) allocate(coeffs_dev(n_stencil)) - allocate(dist_fr_dev(n), dist_bc_dev(n), dist_sa_dev(n), dist_sc_dev(n)) coeffs_b_dev(:, :) = coeffs_b(:, :); coeffs_e_dev(:, :) = coeffs_e(:, :) coeffs_dev(:) = coeffs(:) + + allocate(dist_fr_dev(n), dist_bc_dev(n), dist_sa_dev(n), dist_sc_dev(n)) dist_fr_dev(:) = dist_fr(:); dist_bc_dev(:) = dist_bc(:) dist_sa_dev(:) = dist_sa(:); dist_sc_dev(:) = dist_sc(:) @@ -84,9 +106,28 @@ program test_cuda_tridiag u_send_b_dev(:,:,:) = u_dev(:,1:4,:) u_send_e_dev(:,:,:) = u_dev(:,n-n_halo+1:n,:) - ! to be converted to MPI send/recv for multi-rank simulations - u_recv_b_dev = u_send_e_dev - u_recv_e_dev = u_send_b_dev + ! halo exchange + if (nproc == 1) then + u_recv_b_dev = u_send_e_dev + u_recv_e_dev = u_send_b_dev + else + ! MPI send/recv for multi-rank simulations + call MPI_Isend(u_send_b_dev, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(1), srerr(1)) + call MPI_Irecv(u_recv_e_dev, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag1, MPI_COMM_WORLD, & + mpireq(2), srerr(2)) + call MPI_Isend(u_send_e_dev, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(3), srerr(3)) + call MPI_Irecv(u_recv_b_dev, SZ*n_halo*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag2, MPI_COMM_WORLD, & + mpireq(4), srerr(4)) + + call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr) + end if + call der_univ_dist<<>>( & du_dev, send_b_dev, send_e_dev, u_dev, u_recv_b_dev, u_recv_e_dev, & @@ -94,9 +135,27 @@ program test_cuda_tridiag n, alfa, dist_fr_dev, dist_bc_dev & ) - ! to be converted to MPI send/recv for multi-rank simulations - recv_b_dev = send_e_dev - recv_e_dev = send_b_dev + ! halo exchange for 2x2 systems + if (nproc == 1) then + recv_b_dev = send_e_dev + recv_e_dev = send_b_dev + else + ! MPI send/recv for multi-rank simulations + call MPI_Isend(send_b_dev, SZ*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(1), srerr(1)) + call MPI_Irecv(recv_e_dev, SZ*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(2), srerr(2)) + call MPI_Isend(send_e_dev, SZ*n_block, & + MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, & + mpireq(3), srerr(3)) + call MPI_Irecv(recv_b_dev, SZ*n_block, & + MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, & + mpireq(4), srerr(4)) + + call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr) + end if call der_univ_subs<<>>( & du_dev, recv_b_dev, recv_e_dev, &