Skip to content

Commit

Permalink
Fix transpose kernel map when stride == kernel size
Browse files (browse the repository at this point in the history)
  • Loading branch information
chrischoy committed Mar 28, 2021
1 parent 8695696 commit 73a9008
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 16 deletions.
2 changes: 2 additions & 0 deletions src/coordinate_map_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,8 @@ class CoordinateMapCPU : public CoordinateMap<coordinate_type, TemplatedAllocato
cpu_in_maps in_maps = initialize_maps<cpu_in_map>(1, in_size);
cpu_out_maps out_maps = initialize_maps<cpu_out_map>(1, in_size);

LOG_DEBUG("stride map in_maps.size():", in_size);
LOG_DEBUG("stride map out_maps.size():", out_coordinate_map.size());
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and the
// job sizes. If some jobs finish earlier than others due to imbalance in
Expand Down
8 changes: 4 additions & 4 deletions src/coordinate_map_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,8 @@ struct stride_map_functor<coordinate_type, std::allocator, CoordinateMapCPU,
cpu_kernel_map
operator()(CoordinateMapCPU<coordinate_type, std::allocator> const &in_map,
CoordinateMapCPU<coordinate_type, std::allocator> const &out_map,
default_types::stride_type const &stride) {
return in_map.stride_map(out_map, stride);
default_types::stride_type const &out_tensor_stride) {
return in_map.stride_map(out_map, out_tensor_stride);
}
};

Expand Down Expand Up @@ -777,7 +777,7 @@ CoordinateMapManager<
auto const stride_map =
detail::stride_map_functor<coordinate_type, TemplatedAllocator,
CoordinateMapType, kernel_map_type>()(
out_map, in_map, kernel_stride);
out_map, in_map, in_map.get_tensor_stride());

// TODO Replace the kernel_map values to shared pointers.
m_kernel_maps[kernel_map_key] =
Expand Down Expand Up @@ -1018,7 +1018,7 @@ CoordinateMapManager<coordinate_type, coordinate_field_type, TemplatedAllocator,
auto const stride_map =
detail::stride_map_functor<coordinate_type, TemplatedAllocator,
CoordinateMapType, kernel_map_type>()(
in_map, strided_map, kernel_stride);
in_map, strided_map, strided_map.get_tensor_stride());

m_kernel_maps[kernel_map_key] = std::move(stride_map);
}
Expand Down
24 changes: 12 additions & 12 deletions src/kernel_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -199,36 +199,36 @@ public:

index_type *p_kernel_map =
(index_type *)std::malloc(map_size * 3 * sizeof(index_type));
CUDA_CHECK(cudaMemcpy(p_kernel_map, kernels.begin(0),
// CUDA_CHECK(cudaMemcpy(p_kernel_map, kernels.begin(),
// map_size * sizeof(index_type),
// cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size, in_maps.begin(),
map_size * sizeof(index_type),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size, in_maps.begin(0),
map_size * sizeof(index_type),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, out_maps.begin(0),
CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, out_maps.begin(),
map_size * sizeof(index_type),
cudaMemcpyDeviceToHost));

for (index_type i = 0; i < map_size; ++i) {
std::cout << p_kernel_map[i + 0 * map_size] << ":"
std::cout // << p_kernel_map[i + 0 * map_size] << ":"
<< p_kernel_map[i + 1 * map_size] << "->"
<< p_kernel_map[i + 2 * map_size] << "\n";
}

std::cout << "Swapped kernel map\n";

CUDA_CHECK(cudaMemcpy(p_kernel_map, swapped_gpu_kernel_map.kernels.begin(0),
map_size * sizeof(index_type),
cudaMemcpyDeviceToHost));
// CUDA_CHECK(cudaMemcpy(p_kernel_map, swapped_gpu_kernel_map.kernels.begin(),
// map_size * sizeof(index_type),
// cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(
p_kernel_map + 1 * map_size, swapped_gpu_kernel_map.in_maps.begin(0),
p_kernel_map + 1 * map_size, swapped_gpu_kernel_map.in_maps.begin(),
map_size * sizeof(index_type), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(
p_kernel_map + 2 * map_size, swapped_gpu_kernel_map.out_maps.begin(0),
p_kernel_map + 2 * map_size, swapped_gpu_kernel_map.out_maps.begin(),
map_size * sizeof(index_type), cudaMemcpyDeviceToHost));

for (index_type i = 0; i < map_size; ++i) {
std::cout << p_kernel_map[i + 0 * map_size] << ":"
std::cout // << p_kernel_map[i + 0 * map_size] << ":"
<< p_kernel_map[i + 1 * map_size] << "->"
<< p_kernel_map[i + 2 * map_size] << "\n";
}
Expand Down

0 comments on commit 73a9008

Please sign in to comment.