Skip to content

Commit

Permalink
Merge commit '1eaffd515aa517071625d9d8eecd45e91515444e' into bolt-1eaffd
Browse files Browse the repository at this point in the history
  • Loading branch information
shintaro-iwasaki committed Oct 28, 2020
2 parents c365888 + 1eaffd5 commit 92ac48c
Show file tree
Hide file tree
Showing 44 changed files with 1,057 additions and 468 deletions.
9 changes: 1 addition & 8 deletions cmake/OpenMPTesting.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,6 @@ endfunction()
if (${OPENMP_STANDALONE_BUILD})
find_standalone_test_dependencies()

# Make sure we can use the console pool for recent CMake and Ninja > 1.5.
if (CMAKE_VERSION VERSION_LESS 3.1.20141117)
set(cmake_3_2_USES_TERMINAL)
else()
set(cmake_3_2_USES_TERMINAL USES_TERMINAL)
endif()

# Set lit arguments.
set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail")
if (MSVC OR XCODE)
Expand Down Expand Up @@ -189,7 +182,7 @@ function(add_openmp_testsuite target comment)
COMMAND ${PYTHON_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE} ${LIT_ARGS} ${ARG_UNPARSED_ARGUMENTS}
COMMENT ${comment}
DEPENDS ${ARG_DEPENDS}
${cmake_3_2_USES_TERMINAL}
USES_TERMINAL
)
else()
if (ARG_EXCLUDE_FROM_CHECK_ALL)
Expand Down
13 changes: 2 additions & 11 deletions libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,8 @@ find_library (

# There is a libcuda.so in lib64/stubs that can be used for linking.
if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND)
# Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this
# case CUDA_LIBRARIES contains additional linker arguments which breaks
# get_filename_component below. Fortunately, since that change the module
# exports CUDA_cudart_static_LIBRARY which points to a single file in the
# right directory.
set(cuda_library ${CUDA_LIBRARIES})
if (DEFINED CUDA_cudart_static_LIBRARY)
set(cuda_library ${CUDA_cudart_static_LIBRARY})
endif()
get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY)
find_library (
get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY)
find_library(
LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES
NAMES
cuda
Expand Down
11 changes: 3 additions & 8 deletions libomptarget/deviceRTLs/amdgcn/src/target_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@

#define WARPSIZE 64

// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
#define L1_BARRIER (1)

// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20
Expand Down Expand Up @@ -113,10 +109,9 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}

INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
// we have protected the master warp from releasing from its barrier
// due to a full workgroup barrier in the middle of a work function.
// So it is ok to issue a full workgroup barrier here.
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
(void)num_threads;
// TODO: Implement on top of __SHARED__
__builtin_amdgcn_s_barrier();
}

Expand Down
2 changes: 1 addition & 1 deletion libomptarget/deviceRTLs/common/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class omptarget_nvptx_TeamDescr {
workDescrForActiveParallel; // one, ONLY for the active par

ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
__kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};

Expand Down
2 changes: 1 addition & 1 deletion libomptarget/deviceRTLs/common/src/data_sharing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ INLINE static void data_sharing_init_stack_common() {
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();

for (int WID = 0; WID < WARPSIZE; WID++) {
for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
Expand Down
3 changes: 1 addition & 2 deletions libomptarget/deviceRTLs/common/src/sync.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
PRINT(LD_SYNC,
"call kmpc_barrier with %d omp threads, sync parameter %d\n",
(int)numberOfActiveOMPThreads, (int)threads);
// Barrier #1 is for synchronization among active threads.
__kmpc_impl_named_sync(L1_BARRIER, threads);
__kmpc_impl_named_sync(threads);
}
} else {
// Still need to flush the memory per the standard.
Expand Down
9 changes: 7 additions & 2 deletions libomptarget/deviceRTLs/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
endforeach()

# Override default MAX_SM in src/target_impl.h if requested
if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
endif()

# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
"Activate NVPTX device RTL debug messages.")
Expand All @@ -96,7 +101,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
-I${devicertl_nvptx_directory}/src)
cuda_add_library(bolt-omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})

# Install device RTL under the lib destination folder.
install(TARGETS bolt-omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
Expand Down Expand Up @@ -172,7 +177,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
get_filename_component(outfile ${src} NAME)

add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-c ${infile} -o ${outfile}-sm_${sm}.bc
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
Expand Down
26 changes: 18 additions & 8 deletions libomptarget/deviceRTLs/nvptx/src/target_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,26 +37,33 @@

#define WARPSIZE 32

// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
#define L1_BARRIER (1)

// Maximum number of preallocated arguments to an outlined parallel/simd function.
// Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#else
#define OMP_STATE_COUNT 16
#endif

#if !defined(MAX_SM)
#if __CUDA_ARCH__ >= 900
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
#elif __CUDA_ARCH__ >= 800
// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
// GA102 design has a maximum of 84 SMs
#define MAX_SM 108
#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#define MAX_SM 56
#else
#define OMP_STATE_COUNT 16
#define MAX_SM 16
#endif
#endif

#define OMP_ACTIVE_PARALLEL_LEVEL 128

Expand Down Expand Up @@ -176,7 +183,10 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#endif // CUDA_VERSION
}

INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
int barrier = 1;
asm volatile("bar.sync %0, %1;"
:
: "r"(barrier), "r"(num_threads)
Expand Down
159 changes: 159 additions & 0 deletions libomptarget/include/Debug.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
//===------- Debug.h - Target independent OpenMP target RTL -- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Routines used to provide debug messages and information from libomptarget
// and plugin RTLs to the user.
//
// Each plugin RTL and libomptarget define TARGET_NAME and DEBUG_PREFIX for use
// when sending messages to the user. These indicate which RTL sent the message.
//
// Debug and information messages are controlled by the environment variables
// LIBOMPTARGET_DEBUG and LIBOMPTARGET_INFO, which are read upon initialization
// of libomptarget or the plugin RTL.
//
// To printf a pointer in hex with a fixed width of 16 digits and a leading 0x,
// use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr));
//
// DPxMOD expands to:
// "0x%0*" PRIxPTR
// where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a
// specific platform, e.g. "lx" if uintptr_t is typedef'd as unsigned long:
// "0x%0*lx"
//
// Ultimately, the whole statement expands to:
// printf("ptr=0x%0*lx...\n", // the 0* modifier expects an extra argument
// // specifying the width of the output
// (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width
// // 8 digits for 32bit systems
// // 16 digits for 64bit
// (uintptr_t) ptr);
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGET_DEBUG_H
#define _OMPTARGET_DEBUG_H

/// Read an integer message level from the environment variable \p VarName.
///
/// Returns -1 when the variable is unset or its value does not begin with a
/// decimal integer (after optional leading whitespace). Values that do not fit
/// in an int are saturated instead of raising an error, so a malformed
/// environment can never terminate the process.
static inline int readLevelFromEnv(const char *VarName) {
  const char *EnvStr = getenv(VarName);
  if (!EnvStr)
    return -1;

  char *End = nullptr;
  const long Parsed = strtol(EnvStr, &End, 10);
  if (End == EnvStr)
    return -1; // No digits were consumed: treat like an unset variable.

  const int Level = static_cast<int>(Parsed);
  if (Level != Parsed) {
    // The value does not round-trip through int. Saturate: ~0u >> 1 is the
    // largest positive int; any non-positive level means "disabled" anyway.
    return Parsed > 0 ? static_cast<int>(~0u >> 1) : -1;
  }
  return Level;
}

/// Return the information level requested through LIBOMPTARGET_INFO.
///
/// The environment is inspected once, on the first call; the result (including
/// the -1 "unset or invalid" result) is cached for the lifetime of the process.
/// The function-local static makes that one-time initialization thread-safe.
static inline int getInfoLevel() {
  static const int InfoLevel = readLevelFromEnv("LIBOMPTARGET_INFO");
  return InfoLevel;
}

/// Return the debug level requested through LIBOMPTARGET_DEBUG.
///
/// Same contract as getInfoLevel(): evaluated once on first use, -1 when the
/// variable is unset or not a number.
static inline int getDebugLevel() {
  static const int DebugLevel = readLevelFromEnv("LIBOMPTARGET_DEBUG");
  return DebugLevel;
}

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#undef __STDC_FORMAT_MACROS
#include <stdio.h>
#include <stdlib.h>

// printf conversion for a pointer-sized value: a literal "0x" followed by a
// zero-padded hexadecimal number whose field width is read from the argument
// list (the '*'). Always pair it with DPxPTR.
#define DPxMOD "0x%0*" PRIxPTR
// Expands to the TWO arguments one DPxMOD consumes: the field width (two hex
// digits per byte of uintptr_t) and the pointer converted to uintptr_t.
#define DPxPTR(ptr) ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(ptr))
// Turn the argument into a string literal exactly as written.
#define GETNAME2(name) #name
// Turn the argument into a string literal after macro-expanding it first, so
// GETNAME(TARGET_NAME) yields the value TARGET_NAME is defined to.
#define GETNAME(name) GETNAME2(name)

/// Print a generic message string from libomptarget or a plugin RTL.
///
/// Writes "<TARGET_NAME> message: <_str>" followed by a newline to stderr.
/// \p _str is passed as a "%s" argument, so it may be any C string and any '%'
/// it contains is printed literally. TARGET_NAME must be defined by the
/// including translation unit.
#define MESSAGE0(_str) \
do { \
fprintf(stderr, GETNAME(TARGET_NAME) " message: %s\n", _str); \
} while (0)

/// Print a printf formatting string message from libomptarget or a plugin RTL.
///
/// \p _str is concatenated into the format string, so it must be a string
/// literal; the remaining arguments supply its conversions. At least one
/// variadic argument is required because the expansion always emits the comma
/// before __VA_ARGS__ -- use MESSAGE0 for a message without arguments.
#define MESSAGE(_str, ...) \
do { \
fprintf(stderr, GETNAME(TARGET_NAME) " message: " _str "\n", __VA_ARGS__); \
} while (0)

/// Print fatal error message with an error string and error identifier.
///
/// Writes "<TARGET_NAME> fatal error <_num>: <_str>" followed by a newline to
/// stderr and then calls abort(), so control never returns to the caller.
/// \p _num is formatted with "%d" and \p _str with "%s".
#define FATAL_MESSAGE0(_num, _str) \
do { \
fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: %s\n", _num, _str); \
abort(); \
} while (0)

/// Print fatal error message with a printf string and error identifier.
///
/// Writes "<TARGET_NAME> fatal error <_num>: " followed by the formatted
/// message and a newline to stderr, then calls abort(); control never returns
/// to the caller. \p _str is concatenated into the format string and must be a
/// string literal. At least one variadic argument is required -- use
/// FATAL_MESSAGE0 for a plain string. The separator after the colon matches
/// the one FATAL_MESSAGE0 prints, so both macros produce the same prefix.
#define FATAL_MESSAGE(_num, _str, ...)                                         \
  do {                                                                         \
    fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: " _str "\n", _num,  \
            __VA_ARGS__);                                                      \
    abort();                                                                   \
  } while (0)

/// Print a generic error string from libomptarget or a plugin RTL.
///
/// Writes the "<TARGET_NAME> error: " prefix to stderr and then the
/// printf-style message given by the arguments (format string first). No
/// newline is appended; the caller's format string has to supply it.
#define FAILURE_MESSAGE(...) \
do { \
fprintf(stderr, GETNAME(TARGET_NAME) " error: "); \
fprintf(stderr, __VA_ARGS__); \
} while (0)

/// Print a generic information string for device \p _num.
///
/// Writes the "<TARGET_NAME> device <_num> info: " prefix to stderr and then
/// the printf-style message given by the remaining arguments. This macro
/// prints unconditionally and appends no newline; the INFO macro is the one
/// that checks the LIBOMPTARGET_INFO level before calling it.
#define INFO_MESSAGE(_num, ...) \
do { \
fprintf(stderr, GETNAME(TARGET_NAME) " device %d info: ", _num); \
fprintf(stderr, __VA_ARGS__); \
} while (0)

// Debugging messages.
//
// With OMPTARGET_DEBUG defined these macros print to stderr; without it DEBUGP
// and DP expand to nothing and REPORT degrades to FAILURE_MESSAGE. In both
// configurations DP(...) and REPORT(...) expand to exactly one statement that
// needs the caller's trailing semicolon, so code such as
//   if (cond) DP("..."); else other();
// compiles identically in debug and non-debug builds.
#ifdef OMPTARGET_DEBUG
#include <stdio.h>

/// Print "<prefix> --> " followed by the printf-style message to stderr.
/// NOTE: this expands to a bare brace block (kept for compatibility with
/// existing callers), so it must not be used as the unbraced body of an
/// if/else.
#define DEBUGP(prefix, ...)                                                    \
  {                                                                            \
    fprintf(stderr, "%s --> ", prefix);                                        \
    fprintf(stderr, __VA_ARGS__);                                              \
  }

/// Emit a message for debugging; printed only when getDebugLevel() > 0.
/// DEBUG_PREFIX must be defined by the including translation unit.
#define DP(...)                                                                \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      DEBUGP(DEBUG_PREFIX, __VA_ARGS__);                                       \
    }                                                                          \
  } while (false)

/// Emit a message for debugging or failure if debugging is disabled
#define REPORT(...)                                                            \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      DP(__VA_ARGS__);                                                         \
    } else {                                                                   \
      FAILURE_MESSAGE(__VA_ARGS__);                                            \
    }                                                                          \
  } while (false)
#else
/// Debug output is compiled out: the arguments are discarded unevaluated.
#define DEBUGP(prefix, ...)                                                    \
  {}
/// Debug output is compiled out; still a single statement like the debug form.
#define DP(...)                                                                \
  do {                                                                         \
  } while (false)
/// Without debug support every report is printed as a failure message. No
/// trailing semicolon here: the caller supplies it, as in the debug form.
#define REPORT(...) FAILURE_MESSAGE(__VA_ARGS__)
#endif // OMPTARGET_DEBUG

/// Emit a message giving the user extra information about the runtime.
///
/// If getDebugLevel() > 0 the message is handed to DEBUGP with DEBUG_PREFIX
/// (which prints nothing when DEBUGP is compiled out); otherwise, if
/// getInfoLevel() > 0, it is printed through INFO_MESSAGE for device \p _id.
/// When neither level is positive nothing is printed.
#define INFO(_id, ...) \
do { \
if (getDebugLevel() > 0) { \
DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \
} else if (getInfoLevel() > 0) { \
INFO_MESSAGE(_id, __VA_ARGS__); \
} \
} while (false)

#endif // _OMPTARGET_DEBUG_H
40 changes: 0 additions & 40 deletions libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#define OFFLOAD_FAIL (~0)

#define OFFLOAD_DEVICE_DEFAULT -1
#define HOST_DEVICE -10

/// Data attributes for each data reference used in an OpenMP target region.
enum tgt_map_type {
Expand Down Expand Up @@ -261,45 +260,6 @@ void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount);
}
#endif

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>
#define DPxMOD "0x%0*" PRIxPTR
#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr))

/*
* To printf a pointer in hex with a fixed width of 16 digits and a leading 0x,
* use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr));
*
* DPxMOD expands to:
* "0x%0*" PRIxPTR
* where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a
* specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long:
* "0x%0*lu"
*
* Ultimately, the whole statement expands to:
* printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument
* // specifying the width of the output
* (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width
* // 8 digits for 32bit systems
* // 16 digits for 64bit
* (uintptr_t) ptr);
*/

#ifdef OMPTARGET_DEBUG
#include <stdio.h>
#define DEBUGP(prefix, ...) \
{ \
fprintf(stderr, "%s --> ", prefix); \
fprintf(stderr, __VA_ARGS__); \
}
#else
#define DEBUGP(prefix, ...) \
{}
#endif

#ifdef __cplusplus
#define EXTERN extern "C"
#else
Expand Down
Loading

0 comments on commit 92ac48c

Please sign in to comment.