Merge pull request #8 from semi-h/distributed
Multi-GPU support for the distributed solver tests
semi-h committed Sep 25, 2023
2 parents 549f935 + f062cff commit d826250
Showing 2 changed files with 82 additions and 16 deletions.
17 changes: 12 additions & 5 deletions tests/CMakeLists.txt
@@ -10,17 +10,24 @@ set(CUDATESTSRC
)

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
list(APPEND TESTSRC ${CUDATESTSRC})
foreach(testfile IN LISTS CUDATESTSRC)
get_filename_component(test_name ${testfile} NAME_WE)

add_executable(${test_name} ${testfile})
target_compile_options(${test_name} PRIVATE "-cuda")
target_compile_options(${test_name} PRIVATE "-O3")
target_compile_options(${test_name} PRIVATE "-fast")
target_link_options(${test_name} INTERFACE "-cuda")
target_link_libraries(${test_name} PRIVATE x3d2)

add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}")
endforeach()
endif()

foreach(testfile IN LISTS TESTSRC)
get_filename_component(test_name ${testfile} NAME_WE)

add_executable(${test_name} ${testfile})
target_compile_options(${test_name} PRIVATE "-cuda")
target_compile_options(${test_name} PRIVATE "-O3")
target_compile_options(${test_name} PRIVATE "-fast")
target_link_options(${test_name} INTERFACE "-cuda")
target_link_libraries(${test_name} PRIVATE x3d2)

add_test(NAME ${test_name} COMMAND ${test_name})
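Note on running these tests: CTest now launches each CUDA test through mpirun -np 1, so the distributed code paths are always exercised under MPI, but only on a single rank. A genuine multi-rank, multi-GPU run (for example mpirun -np 4 ./test_cuda_tridiag from the build tree) presumably has to be launched by hand, since the registered command pins the rank count to one.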
81 changes: 70 additions & 11 deletions tests/cuda/test_cuda_tridiag.f90
@@ -1,6 +1,7 @@
program test_cuda_tridiag
use iso_fortran_env, only: stderr => error_unit
use cudafor
use mpi

use m_common, only: dp, SZ, pi
use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs
@@ -27,25 +28,45 @@ program test_cuda_tridiag
dist_sa_dev, dist_sc_dev

integer :: n, n_block, i, j, k, n_halo, n_stencil, n_iters
integer :: ierr, memClockRt, memBusWidth
integer :: n_glob
integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234
integer, allocatable :: srerr(:), mpireq(:)
integer :: ierr, ndevs, devnum, memClockRt, memBusWidth

type(dim3) :: blocks, threads
real(dp) :: dx, dx2, alfa, norm_du, tol = 1d-8, tstart, tend
real(dp) :: achievedBW, deviceBW

n = 512
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)

if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'

ierr = cudaGetDeviceCount(ndevs)
ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
ierr = cudaGetDevice(devnum)
print*, 'I am rank', nrank, 'I am running on device', devnum
pnext = modulo(nrank-nproc+1, nproc)
pprev = modulo(nrank-1, nproc)
print*, 'rank', nrank, 'pnext', pnext, 'pprev', pprev
allocate(srerr(nproc), mpireq(nproc))

n_glob = 512*4
n = n_glob/nproc
n_block = 512*512/SZ
n_iters = 1000

allocate(u(SZ, n, n_block), du(SZ, n, n_block))
allocate(u_dev(SZ, n, n_block), du_dev(SZ, n, n_block))

dx = 2*pi/n
dx = 2*pi/n_glob
dx2 = dx*dx

do k = 1, n_block
do j = 1, n
do i = 1, SZ
u(i, j, k) = sin((j-1)*dx)
u(i, j, k) = sin((j-1+nrank*n)*dx)
end do
end do
end do
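The hunk above sets up the domain decomposition: each rank picks a GPU round-robin, owns n = n_glob/nproc points, and initialises u with the global index so the distributed result can match a serial run. Below is a minimal sketch of that decomposition logic in plain Fortran (no MPI or CUDA required); the hard-coded nrank and nproc are hypothetical stand-ins for MPI_Comm_rank and MPI_Comm_size, not part of the commit.

program decomposition_sketch
  implicit none
  integer :: nrank, nproc, n_glob, n, pprev, pnext
  double precision :: dx
  double precision, parameter :: pi = 3.141592653589793d0

  ! hypothetical stand-ins for MPI_Comm_size / MPI_Comm_rank
  nproc = 4
  nrank = 2

  n_glob = 512*4          ! global points along the derivative direction
  n = n_glob/nproc        ! local slab owned by this rank
  dx = 2*pi/n_glob        ! spacing uses the global size, as in the diff

  ! periodic neighbours; equivalent to the test's modulo(nrank-nproc+1, nproc)
  pnext = modulo(nrank + 1, nproc)
  pprev = modulo(nrank - 1, nproc)

  print *, 'rank', nrank, 'owns global indices', nrank*n, 'to', (nrank + 1)*n - 1
  print *, 'pprev =', pprev, ' pnext =', pnext

  ! the local j-th point samples sin at its global index, exactly as u is
  ! initialised in the test: sin((j-1+nrank*n)*dx)
  print *, 'first local sample:', sin((1 - 1 + nrank*n)*dx)
  print *, 'last  local sample:', sin((n - 1 + nrank*n)*dx)
end program decomposition_sketch

Varying nrank from 0 to nproc-1 shows the ranks tiling the global index range [0, n_glob) with no overlap, which is what makes the per-rank sine initialisation consistent with the single-rank version.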
@@ -61,9 +82,10 @@ program test_cuda_tridiag

allocate(coeffs_b_dev(n_halo, n_stencil), coeffs_e_dev(n_halo, n_stencil))
allocate(coeffs_dev(n_stencil))
allocate(dist_fr_dev(n), dist_bc_dev(n), dist_sa_dev(n), dist_sc_dev(n))
coeffs_b_dev(:, :) = coeffs_b(:, :); coeffs_e_dev(:, :) = coeffs_e(:, :)
coeffs_dev(:) = coeffs(:)

allocate(dist_fr_dev(n), dist_bc_dev(n), dist_sa_dev(n), dist_sc_dev(n))
dist_fr_dev(:) = dist_fr(:); dist_bc_dev(:) = dist_bc(:)
dist_sa_dev(:) = dist_sa(:); dist_sc_dev(:) = dist_sc(:)

@@ -84,19 +106,56 @@ program test_cuda_tridiag
u_send_b_dev(:,:,:) = u_dev(:,1:4,:)
u_send_e_dev(:,:,:) = u_dev(:,n-n_halo+1:n,:)

! to be converted to MPI send/recv for multi-rank simulations
u_recv_b_dev = u_send_e_dev
u_recv_e_dev = u_send_b_dev
! halo exchange
if (nproc == 1) then
u_recv_b_dev = u_send_e_dev
u_recv_e_dev = u_send_b_dev
else
! MPI send/recv for multi-rank simulations
call MPI_Isend(u_send_b_dev, SZ*n_halo*n_block, &
MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, &
mpireq(1), srerr(1))
call MPI_Irecv(u_recv_e_dev, SZ*n_halo*n_block, &
MPI_DOUBLE_PRECISION, pnext, tag1, MPI_COMM_WORLD, &
mpireq(2), srerr(2))
call MPI_Isend(u_send_e_dev, SZ*n_halo*n_block, &
MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, &
mpireq(3), srerr(3))
call MPI_Irecv(u_recv_b_dev, SZ*n_halo*n_block, &
MPI_DOUBLE_PRECISION, pprev, tag2, MPI_COMM_WORLD, &
mpireq(4), srerr(4))

call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr)
end if
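! The exchange posts four non-blocking calls: the first n_halo planes of u go
! to pprev and the last n_halo planes go to pnext, while the matching halos
! are received from pnext and pprev; MPI_Waitall then blocks until all four
! requests complete, so every rank holds its neighbours' boundary data before
! the distributed kernel is launched.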


call der_univ_dist<<<blocks, threads>>>( &
du_dev, send_b_dev, send_e_dev, u_dev, u_recv_b_dev, u_recv_e_dev, &
coeffs_b_dev, coeffs_e_dev, coeffs_dev, &
n, alfa, dist_fr_dev, dist_bc_dev &
)

! to be converted to MPI send/recv for multi-rank simulations
recv_b_dev = send_e_dev
recv_e_dev = send_b_dev
! halo exchange for 2x2 systems
if (nproc == 1) then
recv_b_dev = send_e_dev
recv_e_dev = send_b_dev
else
! MPI send/recv for multi-rank simulations
call MPI_Isend(send_b_dev, SZ*n_block, &
MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, &
mpireq(1), srerr(1))
call MPI_Irecv(recv_e_dev, SZ*n_block, &
MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, &
mpireq(2), srerr(2))
call MPI_Isend(send_e_dev, SZ*n_block, &
MPI_DOUBLE_PRECISION, pnext, tag2, MPI_COMM_WORLD, &
mpireq(3), srerr(3))
call MPI_Irecv(recv_b_dev, SZ*n_block, &
MPI_DOUBLE_PRECISION, pprev, tag1, MPI_COMM_WORLD, &
mpireq(4), srerr(4))

call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr)
end if

call der_univ_subs<<<blocks, threads>>>( &
du_dev, recv_b_dev, recv_e_dev, &

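Both halo exchanges in the test follow the same four-message pattern, first for the n_halo planes of u and then for the 2x2 reduced-system coefficients (tag1 and tag2 are both initialised to 1234, so the mixed tag usage in the second exchange still pairs up correctly). A minimal sketch of how that pattern could be factored into a reusable helper is below; the halo_exchange name, the host-side buffers, and the argument list are illustrative only and not part of this commit. In the test itself the *_dev buffers are passed straight to MPI, which presumably relies on a CUDA-aware MPI build.

subroutine halo_exchange(send_b, send_e, recv_b, recv_e, n_msg, pprev, pnext, tag)
  use mpi
  implicit none
  integer, intent(in) :: n_msg, pprev, pnext, tag
  double precision, intent(in)  :: send_b(n_msg), send_e(n_msg)
  double precision, intent(out) :: recv_b(n_msg), recv_e(n_msg)
  integer :: mpireq(4), ierr

  ! post all four non-blocking calls up front so sends and receives can overlap
  call MPI_Isend(send_b, n_msg, MPI_DOUBLE_PRECISION, pprev, tag, &
                 MPI_COMM_WORLD, mpireq(1), ierr)
  call MPI_Irecv(recv_e, n_msg, MPI_DOUBLE_PRECISION, pnext, tag, &
                 MPI_COMM_WORLD, mpireq(2), ierr)
  call MPI_Isend(send_e, n_msg, MPI_DOUBLE_PRECISION, pnext, tag, &
                 MPI_COMM_WORLD, mpireq(3), ierr)
  call MPI_Irecv(recv_b, n_msg, MPI_DOUBLE_PRECISION, pprev, tag, &
                 MPI_COMM_WORLD, mpireq(4), ierr)

  ! nothing may read recv_b/recv_e until all four requests have completed
  call MPI_Waitall(4, mpireq, MPI_STATUSES_IGNORE, ierr)
end subroutine halo_exchange

With such a helper, the first inline exchange above would reduce to a single call with count SZ*n_halo*n_block and the second to a call with count SZ*n_block.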