diff --git a/.arcconfig b/.arcconfig deleted file mode 100644 index bd06ac847..000000000 --- a/.arcconfig +++ /dev/null @@ -1,4 +0,0 @@ -{ - "repository.callsign" : "OMP", - "conduit_uri" : "https://reviews.llvm.org/" -} diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 000000000..39b5bafef --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,29 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-bolt: + + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + abt: [ yes, no ] + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cmake + run: | + mkdir build + cd build + cmake ../ -DLIBOMP_USE_ARGOBOTS=${{ matrix.abt }} -DOPENMP_ENABLE_WERROR=TRUE + - name: make + run: | + cd build + make -j 2 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..77f211bd4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/argobots"] + path = external/argobots + url = https://github.com/pmodels/argobots diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 000000000..a2e29d837 --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,50 @@ +=============================================================================== + Changes in 1.0 +=============================================================================== + +- Upgraded to LLVM OpenMP 10.0 +- Upgraded Argobots to 1.0 +- Fixed support for untied tasks +- Added tests for OpenMP task and thread scheduling +- Support several platforms including OSX and POWER9. + +=============================================================================== + Changes in 1.0rc3 +=============================================================================== + +- Upgraded Argobots to 1.0rc2 to solve the TLS-related issue +- Fixed support for scheduler sleep + +=============================================================================== + Changes in 1.0rc2 +=============================================================================== + +- Upgraded to LLVM OpenMP 9.0 +- Improved the performance of nested parallel regions +- Support the thread affinity + +=============================================================================== + Changes in 1.0rc1 +=============================================================================== + +- Upgraded to LLVM OpenMP 7.0 +- Support task depend and taskloop +- Support OpenMP 4.5 except untied task and cancellation +- Argobots updated to the latest version and integrated as a git submodule + +=============================================================================== + Changes in 1.0b1 +=============================================================================== + +- Fixed missing some global state initialization +- Fixed bugs related to newer Perl versions +- Updated the embedded Argobots version + +=============================================================================== + Changes in 1.0a1 +=============================================================================== + +# The first release of BOLT, which uses Argobots as a threading layer. + +# Support OpenMP 3.1 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 597eedcec..4d338980f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +cmake_minimum_required(VERSION 3.13.4) # Add cmake directory to search for custom cmake functions. 
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) @@ -6,7 +6,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) # llvm/runtimes/ will set OPENMP_STANDALONE_BUILD. if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(OPENMP_STANDALONE_BUILD TRUE) - project(openmp C CXX) + project(bolt C CXX) # CMAKE_BUILD_TYPE was not set, default to Release. if (NOT CMAKE_BUILD_TYPE) @@ -29,8 +29,17 @@ if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_S set(OPENMP_LLVM_TOOLS_DIR "" CACHE PATH "Path to LLVM tools for testing.") else() set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) - # If building in tree, we honor the same install suffix LLVM uses. - set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") + set(LIBOMP_USE_BOLT_DEFAULT FALSE CACHE BOOL "Use BOLT as a default LLVM OpenMP?") + if (${LIBOMP_USE_BOLT_DEFAULT}) + # If building in tree, we honor the same install suffix LLVM uses. + set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") + else() + # If building in tree, we put BOLT libraries in a special directory + set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/bolt") + endif() + # Place libraries in "lib/bolt" + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/bolt) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/bolt) if (NOT MSVC) set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) @@ -39,6 +48,8 @@ else() set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) endif() + + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include) endif() # Check and set up common compiler flags. @@ -53,6 +64,8 @@ set(OPENMP_TEST_FLAGS "" CACHE STRING set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING "OpenMP compiler flag to use for testing OpenMP runtime libraries.") +# Build external libraries. +add_subdirectory(external) # Build host runtime library. add_subdirectory(runtime) @@ -62,22 +75,50 @@ set(ENABLE_LIBOMPTARGET ON) # Currently libomptarget cannot be compiled on Windows or MacOS X. # Since the device plugins are only supported on Linux anyway, # there is no point in trying to compile libomptarget on other OSes. -if (APPLE OR WIN32 OR NOT OPENMP_HAVE_STD_CPP11_FLAG) +if (APPLE OR WIN32 OR NOT OPENMP_HAVE_STD_CPP14_FLAG) + set(ENABLE_LIBOMPTARGET OFF) +endif() + +# Attempt to locate LLVM source, required by libomptarget +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + if (LLVM_MAIN_INCLUDE_DIR) + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR}) + elseif (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../llvm/include) + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../llvm/include) + endif() +endif() + +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + message(STATUS "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS, disabling libomptarget") set(ENABLE_LIBOMPTARGET OFF) endif() option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading." ${ENABLE_LIBOMPTARGET}) +option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget." + ${ENABLE_LIBOMPTARGET}) if (OPENMP_ENABLE_LIBOMPTARGET) - # Check that the library can acutally be built. + # Check that the library can actually be built. 
if (APPLE OR WIN32) message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!") - elseif (NOT OPENMP_HAVE_STD_CPP11_FLAG) - message(FATAL_ERROR "Host compiler must support C++11 to build libomptarget!") + elseif (NOT OPENMP_HAVE_STD_CPP14_FLAG) + message(FATAL_ERROR "Host compiler must support C++14 to build libomptarget!") endif() add_subdirectory(libomptarget) endif() -# Now that we have seen all testuites, create the check-openmp target. +set(ENABLE_OMPT_TOOLS ON) +# Currently tools are not tested well on Windows or MacOS X. +if (APPLE OR WIN32) + set(ENABLE_OMPT_TOOLS OFF) +endif() + +option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP." + ${ENABLE_OMPT_TOOLS}) +if (OPENMP_ENABLE_OMPT_TOOLS) + add_subdirectory(tools) +endif() + +# Now that we have seen all testsuites, create the check-openmp target. construct_check_openmp_target() diff --git a/CREDITS.txt b/CREDITS.txt index b14bb9a1e..ede45b10f 100644 --- a/CREDITS.txt +++ b/CREDITS.txt @@ -53,6 +53,10 @@ N: Steven Noonan E: steven@uplinklabs.net D: Patches for the ARM architecture and removal of several inconsistencies. +N: Joachim Protze +E: protze@itc.rwth-aachen.de +D: OpenMP Tools Interface, Archer tool + N: Alp Toker E: alp@nuanti.com D: Making build work for FreeBSD. diff --git a/LICENSE.txt b/LICENSE.txt index 990756638..2153a1bab 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,3 +1,39 @@ +============================================================================== +BOLT is a derivative of the Intel OpenMP runtime. The original pieces of the +code from the Intel OpenMP runtime are copyrighted to Intel, and the pieces +modified for BOLT are copyrighted to UChicago Argonne, LLC. +============================================================================== + Copyright (c) 2016, UChicago Argonne, LLC + All Rights Reserved + BOLT: OpenMP over Lightweight Threads, SF-16-140 + OPEN SOURCE LICENSE +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. Software changes, + modifications, or derivative works, should be noted with comments and the + author and organization's name. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the names of UChicago Argonne, LLC or the Department of Energy nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +4. The software and the end-user documentation included with the + redistribution, if any, must include the following acknowledgment: + "This product includes software produced by UChicago Argonne, LLC under + Contract No. DE-AC02-06CH11357 with the Department of Energy." +****************************************************************************** + DISCLAIMER + THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. 
+NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF +ENERGY, NOR UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY +WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY +FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, +APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT +INFRINGE PRIVATELY OWNED RIGHTS. +****************************************************************************** + ============================================================================== The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== diff --git a/README.md b/README.md new file mode 100644 index 000000000..6776e7710 --- /dev/null +++ b/README.md @@ -0,0 +1,263 @@ +# BOLT: OpenMP over Lightweight Threads + +BOLT targets a high-performing OpenMP implementation, especially specialized +for fine-grain parallelism. BOLT utilizes a lightweight threading model for +its underlying threading mechanism. It currently adopts Argobots, a new +holistic, low-level threading and tasking runtime, in order to overcome +shortcomings of conventional OS-level threads. The current BOLT implementation +is based on the OpenMP runtime in LLVM, and thus it can be used with +LLVM/Clang, Intel OpenMP compiler, and GCC. More information about BOLT can be +found at http://www.bolt-omp.org. + + +1. Getting Started +2. Testing BOLT +3. BOLT-Specific Environmental Variables +4. Reporting Problems +5. Alternate Build Options + + +------------------------------------------------------------------------------- + +1. Getting Started +================== + +The following instructions take you through a sequence of steps to get the +default configuration of BOLT up and running. + +Henceforth, VERSION indicates the version number of the release tarball. + +(a) You will need the following prerequisites. + + - REQUIRED: This tar file bolt-VERSION.tar.gz + + - REQUIRED: C and C++ compilers (gcc and g++ are sufficient) + + - REQUIRED: CMake (http://www.cmake.org/download) + + - OPTIONAL: Argobots (http://www.argobots.org) + The BOLT release tarball includes the Argobots source code, and + thus you can build BOLT together with the built-in Argobots. + Of course, you can use your own Argobots build instead of the + accompanied one. In the latter case, we assume Argobots has + been installed in /home/USERNAME/argobots-install. + + Also, you need to know what shell you are using since different shell has + different command syntax. Command "echo $SHELL" prints out the current shell + used by your terminal program. + + Note: if you obtained BOLT via github, the following commands download the + built-in Argobots from the Argobots repository. + + git submodule init + git submodule update + +(b) Unpack the tar file and create a build directory: + + tar xzf bolt-VERSION.tar.gz + mkdir bolt-build + cd bolt-build + + If your tar doesn't accept the z option, use + + gunzip bolt-VERSION.tar.gz + tar xf bolt-VERSION.tar + mkdir bolt-build + cd bolt-build + +(c) Choose an installation directory, say /home/USERNAME/bolt-install, which is +assumed to be non-existent or empty. 
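If you would like a small program on hand to check the installation after step (f), the minimal OpenMP C program below is sufficient; the file name hello_bolt.c is only an example and is not part of the release tarball. Compile it with the compiler you pass as OPENMP_TEST_C_COMPILER, adding -fopenmp (plus -I/-L flags for the install prefix chosen above, if needed), and run it to confirm that the runtime starts and reports its thread count.

    /* hello_bolt.c - minimal check program (illustrative name only).
     * Prints the number of OpenMP threads the runtime creates. */
    #include <stdio.h>
    #include <omp.h>

    int main(void) {
    #pragma omp parallel
        {
    #pragma omp single
            printf("OpenMP runtime is up with %d threads\n",
                   omp_get_num_threads());
        }
        return 0;
    }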
+ +(d) Configure BOLT specifying the installation directory: + + If you want to use the built-in Argobots, + + for csh and tcsh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER= \ + -DCMAKE_CXX_COMPILER= \ + -DOPENMP_TEST_C_COMPILER= \ + -DOPENMP_TEST_CXX_COMPILER= \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + |& tee c.txt + + for bash and sh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER= \ + -DCMAKE_CXX_COMPILER= \ + -DOPENMP_TEST_C_COMPILER= \ + -DOPENMP_TEST_CXX_COMPILER= \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + 2>&1 | tee c.txt + + If you want to use your own Argobots build, + + for csh and tcsh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER= \ + -DCMAKE_CXX_COMPILER= \ + -DOPENMP_TEST_C_COMPILER= \ + -DOPENMP_TEST_CXX_COMPILER= \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + -DLIBOMP_ARGOBOTS_INSTALL_DIR=/home/USERNAME/argobots-install \ + |& tee c.txt + + for bash and sh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER= \ + -DCMAKE_CXX_COMPILER= \ + -DOPENMP_TEST_C_COMPILER= \ + -DOPENMP_TEST_CXX_COMPILER= \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + -DLIBOMP_ARGOBOTS_INSTALL_DIR=/home/USERNAME/argobots-install \ + 2>&1 | tee c.txt + + Bourne-like shells, sh and bash, accept "2>&1 |". Csh-like shell, csh and + tcsh, accept "|&". If a failure occurs, the cmake command will display the + error. Most errors are straight-forward to follow. + +(e) Build BOLT: + + for csh and tcsh: + + make |& tee m.txt + + for bash and sh: + + make 2>&1 | tee m.txt + + This step should succeed if there were no problems with the preceding step. + Check file m.txt. If there were problems, do a "make clean" and then run + make again with V=1 and VERBOSE=1. + + make V=1 VERBOSE=1 |& tee m.txt (for csh and tcsh) + + OR + + make V=1 VERBOSE=1 2>&1 | tee m.txt (for bash and sh) + + Then go to step 3 below, for reporting the issue to the BOLT developers and + other users. + +(f) Install BOLT: + + for csh and tcsh: + + make install |& tee mi.txt + + for bash and sh: + + make install 2>&1 | tee mi.txt + + This step collects all required header and library files in the directory + specified by the prefix argument to cmake. + +------------------------------------------------------------------------------- + +2. Testing BOLT +=============== + +To test BOLT, you can run the test suite. Compilers for testing must be +specified when you run cmake. + +For example, if llvm-lit is installed: + + cd bolt-build + NUM_PARALLEL_TESTS=16 + llvm-lit runtime/test -v -j $NUM_PARALLEL_TESTS --timeout 600 + +If you run into any problems on running the test suite, please follow step 3 +below for reporting them to the BOLT developers and other users. + +------------------------------------------------------------------------------- + +3. BOLT-Specific Environmental Variables +=============== + +BOLT reveals several environmental variables specific to BOLT. + + KMP_ABT_NUM_ESS=: Set the number of execution streams which are + running on OS-level threads (e.g., Pthreads). + KMP_ABT_SCHED_SLEEP=<1|0>: If it is set to 1, sleep a scheduler when the + associate pools are empty. 
+ KMP_ABT_VERBOSE=<1|0>: If it is set to 1, print all the BOLT-specific + parameters on runtime initialization. + KMP_ABT_FORK_CUTOFF=: Set the cut-off threshold used for a + divide-and-conquer thread creation. + KMP_ABT_FORK_NUM_WAYS=: Set the number of ways for a + divide-and-conquer thread creation. + KMP_ABT_SCHED_MIN_SLEEP_NSEC=: Set the minimum scheduler sleep time + (nanoseconds). + KMP_ABT_SCHED_MAX_SLEEP_NSEC=: Set the maximum scheduler sleep time + (nanoseconds). + KMP_ABT_SCHED_EVENT_FREQ=: Set the event-checking frequency of + schedulers. + KMP_ABT_WORK_STEAL_FREQ=: Set the random work stealing frequency of + schedulers. + +------------------------------------------------------------------------------- + +4. Reporting Problems +===================== + +If you have problems with the installation or usage of BOLT, please follow +these steps: + +(a) First visit the Frequently Asked Questions (FAQ) page at +https://github.com/pmodels/bolt/wiki/FAQ +to see if the problem you are facing has a simple solution. + +(b) If you cannot find an answer on the FAQ page, look through previous +email threads on the discuss@bolt-omp.org mailing list archive +(https://lists.bolt-omp.org/mailman/listinfo/discuss). It is likely +someone else had a similar problem, which has already been resolved +before. + +(c) If neither of the above steps work, please send an email to +discuss@bolt-omp.org. You need to subscribe to this list +(https://lists.bolt-omp.org/mailman/listinfo/discuss) before sending +an email. + +Your email should contain the following files. ONCE AGAIN, PLEASE +COMPRESS BEFORE SENDING, AS THE FILES CAN BE LARGE. Note that, +depending on which step the build failed, some of the files might not +exist. + + bolt-build/c.txt (generated in step 1(d) above) + bolt-build/m.txt (generated in step 1(e) above) + bolt-build/mi.txt (generated in step 1(f) above) + + DID WE MENTION? DO NOT FORGET TO COMPRESS THESE FILES! + +Finally, please include the actual error you are seeing when running +the application. If possible, please try to reproduce the error with +a smaller application or benchmark and send that along in your bug +report. + +(d) If you have found a bug in BOLT, we request that you report it +at our github issues page (https://github.com/pmodels/bolt/issues). +Even if you believe you have found a bug, we recommend you sending an +email to discuss@bolt-omp.org first. + +------------------------------------------------------------------------------- + +5. Alternate Build Options +============================== + +BOLT is based on the OpenMP subproject of LLVM for runtime, and thus it uses +the same build options provided in LLVM. + +Please visit http://openmp.llvm.org/ for more build options. + diff --git a/README.rst b/README.rst index 7f747caf9..e46ed1a1a 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ How to Build the LLVM* OpenMP* Libraries This repository requires `CMake `_ v2.8.0 or later. LLVM and Clang need a more recent version which also applies for in-tree builds. For more information than available in this document please see -`LLVM's CMake documentation `_ and the +`LLVM's CMake documentation `_ and the `official documentation `_. .. contents:: @@ -130,6 +130,10 @@ Options for all Libraries Specify full path to ``FileCheck`` executable for running tests. The default is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**. +**OPENMP_NOT_EXECUTABLE** = ``/path/to/not`` + Specify full path to ``not`` executable for running tests. 
The default + is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**. + Options for ``libomp`` ---------------------- @@ -187,7 +191,7 @@ Optional Features Align certain data structures on 4096-byte. This option is useful on multi-node systems where a small ``CACHE_LINE`` setting leads to false sharing. -**LIBOMP_OMPT_SUPPORT** = ``ON|OFF`` +**LIBBOLT_OMPT_SUPPORT** = ``ON|OFF`` Include support for the OpenMP Tools Interface (OMPT). This option is supported and ``ON`` by default for x86, x86_64, AArch64, PPC64 and RISCV64 on Linux* and macOS*. @@ -195,7 +199,7 @@ Optional Features **LIBOMP_OMPT_OPTIONAL** = ``ON|OFF`` Include support for optional OMPT functionality. This option is ignored if - **LIBOMP_OMPT_SUPPORT** is ``OFF``. + **LIBBOLT_OMPT_SUPPORT** is ``OFF``. **LIBOMP_STATS** = ``OFF|ON`` Include stats-gathering code. diff --git a/cmake/DetectTestCompiler/CMakeLists.txt b/cmake/DetectTestCompiler/CMakeLists.txt index 1fd7cc715..7fa32a909 100644 --- a/cmake/DetectTestCompiler/CMakeLists.txt +++ b/cmake/DetectTestCompiler/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.13.4) project(DetectTestCompiler C CXX) include(CheckCCompilerFlag) @@ -9,6 +9,7 @@ function(write_compiler_information lang) set(information "${information}\\;${CMAKE_${lang}_COMPILER_ID}") set(information "${information}\\;${CMAKE_${lang}_COMPILER_VERSION}") set(information "${information}\\;${${lang}_FLAGS}") + set(information "${information}\\;${${lang}_HAS_TSAN_FLAG}") file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${lang}CompilerInformation.txt ${information}) endfunction(write_compiler_information) @@ -39,5 +40,9 @@ if (CMAKE_C_COMPILER_ID STREQUAL "Clang") add_experimental_isel_flag(CXX) endif() +SET(CMAKE_REQUIRED_FLAGS "-fsanitize=thread") +check_c_compiler_flag("" C_HAS_TSAN_FLAG) +check_cxx_compiler_flag("" CXX_HAS_TSAN_FLAG) + write_compiler_information(C) write_compiler_information(CXX) diff --git a/cmake/HandleOpenMPOptions.cmake b/cmake/HandleOpenMPOptions.cmake index eb7b286b3..15382bcf1 100644 --- a/cmake/HandleOpenMPOptions.cmake +++ b/cmake/HandleOpenMPOptions.cmake @@ -29,7 +29,7 @@ append_if(OPENMP_HAVE_WNO_EXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WNO_PEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -append_if(OPENMP_HAVE_STD_GNUPP11_FLAG "-std=gnu++11" CMAKE_CXX_FLAGS) -if (NOT OPENMP_HAVE_STD_GNUPP11_FLAG) - append_if(OPENMP_HAVE_STD_CPP11_FLAG "-std=c++11" CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_STD_GNUPP14_FLAG "-std=gnu++14" CMAKE_CXX_FLAGS) +if (NOT OPENMP_HAVE_STD_GNUPP14_FLAG) + append_if(OPENMP_HAVE_STD_CPP14_FLAG "-std=c++14" CMAKE_CXX_FLAGS) endif() diff --git a/cmake/OpenMPTesting.cmake b/cmake/OpenMPTesting.cmake index 52e68aa15..40238227a 100644 --- a/cmake/OpenMPTesting.cmake +++ b/cmake/OpenMPTesting.cmake @@ -1,5 +1,5 @@ # Keep track if we have all dependencies. -set(ENABLE_CHECK_TARGETS TRUE) +set(ENABLE_CHECK_BOLT_TARGETS TRUE) # Function to find required dependencies for testing. 
function(find_standalone_test_dependencies) @@ -8,7 +8,7 @@ function(find_standalone_test_dependencies) if (NOT PYTHONINTERP_FOUND) message(STATUS "Could not find Python.") message(WARNING "The check targets will not be available!") - set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) return() endif() @@ -20,7 +20,7 @@ function(find_standalone_test_dependencies) message(STATUS "Cannot find llvm-lit.") message(STATUS "Please put llvm-lit in your PATH, set OPENMP_LLVM_LIT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") message(WARNING "The check targets will not be available!") - set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) return() endif() @@ -31,7 +31,18 @@ function(find_standalone_test_dependencies) message(STATUS "Cannot find FileCheck.") message(STATUS "Please put FileCheck in your PATH, set OPENMP_FILECHECK_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") message(WARNING "The check targets will not be available!") - set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + return() + endif() + + find_program(OPENMP_NOT_EXECUTABLE + NAMES not + PATHS ${OPENMP_LLVM_TOOLS_DIR}) + if (NOT OPENMP_NOT_EXECUTABLE) + message(STATUS "Cannot find 'not'.") + message(STATUS "Please put 'not' in your PATH, set OPENMP_NOT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) return() endif() endfunction() @@ -39,13 +50,6 @@ endfunction() if (${OPENMP_STANDALONE_BUILD}) find_standalone_test_dependencies() - # Make sure we can use the console pool for recent CMake and Ninja > 1.5. - if (CMAKE_VERSION VERSION_LESS 3.1.20141117) - set(cmake_3_2_USES_TERMINAL) - else() - set(cmake_3_2_USES_TERMINAL USES_TERMINAL) - endif() - # Set lit arguments. set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail") if (MSVC OR XCODE) @@ -55,6 +59,7 @@ if (${OPENMP_STANDALONE_BUILD}) separate_arguments(OPENMP_LIT_ARGS) else() set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not) endif() # Macro to extract information about compiler from file. (no own scope) @@ -64,11 +69,13 @@ macro(extract_test_compiler_information lang file) list(GET information 1 id) list(GET information 2 version) list(GET information 3 openmp_flags) + list(GET information 4 has_tsan_flags) set(OPENMP_TEST_${lang}_COMPILER_PATH ${path}) set(OPENMP_TEST_${lang}_COMPILER_ID ${id}) set(OPENMP_TEST_${lang}_COMPILER_VERSION ${version}) set(OPENMP_TEST_${lang}_COMPILER_OPENMP_FLAGS ${openmp_flags}) + set(OPENMP_TEST_${lang}_COMPILER_HAS_TSAN_FLAGS ${has_tsan_flags}) endmacro() # Function to set variables with information about the test compiler. 
@@ -79,11 +86,12 @@ function(set_test_compiler_information dir) "${OPENMP_TEST_C_COMPILER_VERSION}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_VERSION}")) message(STATUS "Test compilers for C and C++ don't match.") message(WARNING "The check targets will not be available!") - set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE) + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) else() set(OPENMP_TEST_COMPILER_ID "${OPENMP_TEST_C_COMPILER_ID}" PARENT_SCOPE) set(OPENMP_TEST_COMPILER_VERSION "${OPENMP_TEST_C_COMPILER_VERSION}" PARENT_SCOPE) set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "${OPENMP_TEST_C_COMPILER_OPENMP_FLAGS}" PARENT_SCOPE) + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS "${OPENMP_TEST_C_COMPILER_HAS_TSAN_FLAGS}" PARENT_SCOPE) # Determine major version. string(REGEX MATCH "[0-9]+" major "${OPENMP_TEST_C_COMPILER_VERSION}") @@ -109,7 +117,7 @@ if (${OPENMP_STANDALONE_BUILD}) if (DETECT_COMPILER_RESULT) message(STATUS "Could not detect test compilers.") message(WARNING "The check targets will not be available!") - set(ENABLE_CHECK_TARGETS FALSE) + set(ENABLE_CHECK_BOLT_TARGETS FALSE) else() set_test_compiler_information(${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler) endif() @@ -118,8 +126,8 @@ else() set(OPENMP_TEST_COMPILER_ID "Clang") # Cannot use CLANG_VERSION because we are not guaranteed that this is already set. set(OPENMP_TEST_COMPILER_VERSION "${LLVM_VERSION}") - set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_MAJOR_VERSION}") - set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}") + set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_VERSION_MAJOR}") + set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") # Unfortunately the top-level cmake/config-ix.cmake file mangles CMake's # CMAKE_THREAD_LIBS_INIT variable from the FindThreads package, so work # around that, until it is fixed there. @@ -128,6 +136,11 @@ else() else() set(OPENMP_TEST_COMPILER_THREAD_FLAGS "${CMAKE_THREAD_LIBS_INIT}") endif() + if(TARGET tsan) + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS 1) + else() + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS 0) + endif() # TODO: Implement blockaddress in GlobalISel and remove this flag! set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "-fopenmp ${OPENMP_TEST_COMPILER_THREAD_FLAGS} -fno-experimental-isel") endif() @@ -148,16 +161,16 @@ set_test_compiler_features() # Function to add a testsuite for an OpenMP runtime library. function(add_openmp_testsuite target comment) - if (NOT ENABLE_CHECK_TARGETS) + if (NOT ENABLE_CHECK_BOLT_TARGETS) add_custom_target(${target} COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, dependencies not found.") message(STATUS "${target} does nothing.") return() endif() - cmake_parse_arguments(ARG "" "" "DEPENDS;ARGS" ${ARGN}) - # EXCLUDE_FROM_ALL excludes the test ${target} out of check-openmp. - if (NOT EXCLUDE_FROM_ALL) + cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "" "DEPENDS;ARGS" ${ARGN}) + # EXCLUDE_FROM_CHECK_ALL excludes the test ${target} out of check-openmp. + if (NOT ARG_EXCLUDE_FROM_CHECK_ALL) # Register the testsuites and depends for the check-openmp rule. 
set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_TESTSUITES ${ARG_UNPARSED_ARGUMENTS}) set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_DEPENDS ${ARG_DEPENDS}) @@ -169,15 +182,25 @@ function(add_openmp_testsuite target comment) COMMAND ${PYTHON_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE} ${LIT_ARGS} ${ARG_UNPARSED_ARGUMENTS} COMMENT ${comment} DEPENDS ${ARG_DEPENDS} - ${cmake_3_2_USES_TERMINAL} + USES_TERMINAL ) else() - add_lit_testsuite(${target} - ${comment} - ${ARG_UNPARSED_ARGUMENTS} - DEPENDS clang clang-resource-headers FileCheck ${ARG_DEPENDS} - ARGS ${ARG_ARGS} - ) + if (ARG_EXCLUDE_FROM_CHECK_ALL) + add_lit_testsuite(${target} + ${comment} + ${ARG_UNPARSED_ARGUMENTS} + EXCLUDE_FROM_CHECK_ALL + DEPENDS clang FileCheck ${ARG_DEPENDS} + ARGS ${ARG_ARGS} + ) + else() + add_lit_testsuite(${target} + ${comment} + ${ARG_UNPARSED_ARGUMENTS} + DEPENDS clang FileCheck ${ARG_DEPENDS} + ARGS ${ARG_ARGS} + ) + endif() endif() endfunction() @@ -186,6 +209,5 @@ function(construct_check_openmp_target) get_property(OPENMP_LIT_DEPENDS GLOBAL PROPERTY OPENMP_LIT_DEPENDS) # We already added the testsuites themselves, no need to do that again. - set(EXCLUDE_FROM_ALL True) - add_openmp_testsuite(check-openmp "Running OpenMP tests" ${OPENMP_LIT_TESTSUITES} DEPENDS ${OPENMP_LIT_DEPENDS}) + add_openmp_testsuite(check-bolt-openmp "Running BOLT tests" ${OPENMP_LIT_TESTSUITES} EXCLUDE_FROM_CHECK_ALL DEPENDS ${OPENMP_LIT_DEPENDS}) endfunction() diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index ebb2530ae..d9ea3bbb0 100644 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -14,5 +14,5 @@ check_cxx_compiler_flag(-Wno-extra OPENMP_HAVE_WNO_EXTRA_FLAG) check_cxx_compiler_flag(-Wno-pedantic OPENMP_HAVE_WNO_PEDANTIC_FLAG) check_cxx_compiler_flag(-Wno-maybe-uninitialized OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG) -check_cxx_compiler_flag(-std=gnu++11 OPENMP_HAVE_STD_GNUPP11_FLAG) -check_cxx_compiler_flag(-std=c++11 OPENMP_HAVE_STD_CPP11_FLAG) +check_cxx_compiler_flag(-std=gnu++14 OPENMP_HAVE_STD_GNUPP14_FLAG) +check_cxx_compiler_flag(-std=c++14 OPENMP_HAVE_STD_CPP14_FLAG) diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst new file mode 100644 index 000000000..b7f2ec422 --- /dev/null +++ b/docs/ReleaseNotes.rst @@ -0,0 +1,45 @@ +=========================== +openmp 11.0.0 Release Notes +=========================== + +.. contents:: + :local: + +.. warning:: + These are in-progress notes for the upcoming LLVM 11.0.0 release. + Release notes for previous releases can be found on + `the Download Page `_. + +Introduction +============ + +This document contains the release notes for the OpenMP runtime, release 11.0.0. +Here we describe the status of openmp, including major improvements +from the previous release. All openmp releases may be downloaded +from the `LLVM releases web site `_. + +Non-comprehensive list of changes in this release +================================================= + +5.0 features +------------ + +* ... + +5.1 features +------------ + +* ... + +OMPT Improvements +----------------- + +* Added OMPT callbacks for doacross loops, detached tasks +* Added handling for mutexinoutset dependencies + +OMPT-based Tools +---------------- + +* Added ompt-multiplex.h as a header-only OMPT-tool to support nesting of OMPT + tools. 
(see openmp/tools/multiplex) + diff --git a/examples/sample_nested.c b/examples/sample_nested.c new file mode 100644 index 000000000..c3479b034 --- /dev/null +++ b/examples/sample_nested.c @@ -0,0 +1,53 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + + int size = (argc > 1) ? atoi(argv[1]) : 100; + int i, j; + int nthreads; + struct timeval t_start, t_end; + double time; + + double *a = (double *)malloc(sizeof(double) * size * size); + +#pragma omp parallel + { nthreads = omp_get_num_threads(); } + + for (i = 0; i < size * size; i++) { + a[i] = i; + } + + gettimeofday(&t_start, NULL); + +#pragma omp parallel for + for (i = 0; i < size; i++) { +#pragma omp parallel for + for (j = 0; j < size; j++) { + a[i * size + j] = a[i * size + j] * 0.9; + } + } + + gettimeofday(&t_end, NULL); + + time = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start.tv_sec * 1000000 + t_start.tv_usec); + + printf("%d %f\n", nthreads, time / 1000000.0); + + for (i = 0; i < size * size; i++) { + if (a[i] != i * 0.9) { + printf("a[%d]=%f\n", i, a[i]); + return 1; + } + } + free(a); +} diff --git a/examples/sample_task_multiple_producer.c b/examples/sample_task_multiple_producer.c new file mode 100644 index 000000000..f540b015f --- /dev/null +++ b/examples/sample_task_multiple_producer.c @@ -0,0 +1,52 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + + int i, num = (argc > 1) ? atoi(argv[1]) : 100; + int nthreads; + struct timeval t_start, t_end; + double time; + double *a = (double *)malloc(sizeof(double) * num); + +#pragma omp parallel + { nthreads = omp_get_num_threads(); } + + for (i = 0; i < num; i++) { + a[i] = i; + } + + gettimeofday(&t_start, NULL); + +#pragma omp parallel + { +#pragma omp for + for (i = 0; i < num; i++) { +#pragma omp task + { a[i] *= 0.9; } + } + } + + gettimeofday(&t_end, NULL); + + time = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start.tv_sec * 1000000 + t_start.tv_usec); + + printf("%d %f\n", nthreads, time / 1000000.0); + + for (i = 0; i < num; i++) { + if (a[i] != i * 0.9) { + printf("a[%d]=%f != %f\n", i, a[i], i * 0.9); + return 1; + } + } + free(a); +} diff --git a/examples/sample_task_single_producer.c b/examples/sample_task_single_producer.c new file mode 100644 index 000000000..6ddc00d50 --- /dev/null +++ b/examples/sample_task_single_producer.c @@ -0,0 +1,54 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + + int i, num = (argc > 1) ? 
atoi(argv[1]) : 100; + int nthreads; + struct timeval t_start, t_end; + double time; + double *a = (double *)malloc(sizeof(double) * num); + +#pragma omp parallel + { nthreads = omp_get_num_threads(); } + + for (i = 0; i < num; i++) { + a[i] = i; + } + + gettimeofday(&t_start, NULL); + +#pragma omp parallel + { +#pragma omp single + { + for (i = 0; i < num; i++) { +#pragma omp task + { a[i] *= 0.9; } + } + } + } + + gettimeofday(&t_end, NULL); + + time = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start.tv_sec * 1000000 + t_start.tv_usec); + + printf("%d %f\n", nthreads, time / 1000000.0); + + for (i = 0; i < num; i++) { + if (a[i] != i * 0.9) { + printf("a[%d]=%f != %f\n", i, a[i], i * 0.9); + return 1; + } + } + free(a); +} diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt new file mode 100644 index 000000000..fe263709a --- /dev/null +++ b/external/CMakeLists.txt @@ -0,0 +1,87 @@ +# -*- Mode: CMakeLists.txt; -*- +# +# See LICENSE.txt in top-level directory. +# + +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) + +# Check if the built-in Argobots needs to be used +set(LIBOMP_USE_ARGOBOTS FALSE CACHE BOOL + "Use Argobots (http://www.argobots.org) as threading model?") +if(${LIBOMP_USE_ARGOBOTS} AND (NOT LIBOMP_ARGOBOTS_INSTALL_DIR)) + set(LIBOMP_USE_BUILTIN_ARGOBOTS TRUE CACHE BOOL "") +else() + set(LIBOMP_USE_BUILTIN_ARGOBOTS FALSE CACHE BOOL "") +endif() + +if(${LIBOMP_USE_BUILTIN_ARGOBOTS}) + # Check if the built-in Argobots exists. + if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/argobots/configure.ac") + if (EXISTS "${PROJECT_SOURCE_DIR}/.git") + message(STATUS "Running `git submodule update --init`") + execute_process(COMMAND git submodule update --init + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if (NOT GIT_SUBMOD_RESULT EQUAL "0") + message(STATUS "git submodule update --init failed with `${GIT_SUBMOD_RESULT}`. ") + else() + message(STATUS "Running `git submodule update --init` - Success") + endif() + endif() + if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/argobots/configure.ac") + message(FATAL_ERROR "The built-in Argobots is not found. " + "Please run `git submodule update --init` if you get BOLT via GitHub, or " + "download Argobots manually and place it at `external/argobots`.") + endif() + endif() + # Use the built-in Argobots + set(ABT_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/argobots) + set(ABT_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/argobots) + set(ABT_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/argobots/install) + set(ABT_AUTOGEN ${ABT_SRC_DIR}/autogen.sh) + set(ABT_CONFIGURE ${ABT_SRC_DIR}/configure) + if ((NOT EXISTS ${ABT_CONFIGURE}) OR (${ABT_AUTOGEN} IS_NEWER_THAN ${ABT_CONFIGURE})) + execute_process( + COMMAND ./autogen.sh + WORKING_DIRECTORY ${ABT_SRC_DIR} + ) + endif() + + include(ExternalProject) + ExternalProject_Add(libabt + SOURCE_DIR ${ABT_SRC_DIR} + BINARY_DIR ${ABT_BUILD_DIR} + CONFIGURE_COMMAND ${ABT_CONFIGURE} --prefix=${ABT_INSTALL_DIR} CC=${CMAKE_C_COMPILER} + ) + # Install libraries. + if(${OPENMP_STANDALONE_BUILD}) + set(ABT_LIBS_INSTALL_PATH ${CMAKE_INSTALL_PREFIX}/lib) + else() + set(ABT_LIBS_INSTALL_PATH ${OPENMP_INSTALL_LIBDIR}) + endif() + # pkgconfig is not installed. + install(DIRECTORY ${ABT_INSTALL_DIR}/lib/ + DESTINATION ${ABT_LIBS_INSTALL_PATH} + PATTERN pkgconfig EXCLUDE) + # Install headers. pkgconfig is not installed. + if(${OPENMP_STANDALONE_BUILD}) + set(ABT_HEADERS_INSTALL_PATH ${CMAKE_INSTALL_PREFIX}/include) + else() + string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" 
CLANG_VERSION ${PACKAGE_VERSION}) + set(ABT_HEADERS_INSTALL_PATH "${OPENMP_INSTALL_LIBDIR}/clang/${CLANG_VERSION}/include") + endif() + install(FILES ${ABT_INSTALL_DIR}/include/abt.h + DESTINATION ${ABT_HEADERS_INSTALL_PATH}) + + set(LIBOMP_ARGOBOTS_INSTALL_DIR ${ABT_INSTALL_DIR} PARENT_SCOPE) +else() + set(LIBOMP_ARGOBOTS_INSTALL_DIR /usr/local CACHE PATH + "Install path for Argobots") +endif() + +if(${LIBOMP_USE_ARGOBOTS}) + # Need to be python-bool. + set(LIBOMP_TEST_USE_ARGOBOTS True PARENT_SCOPE) +else() + set(LIBOMP_TEST_USE_ARGOBOTS False PARENT_SCOPE) +endif() diff --git a/external/argobots b/external/argobots new file mode 160000 index 000000000..b8786cbd1 --- /dev/null +++ b/external/argobots @@ -0,0 +1 @@ +Subproject commit b8786cbd1462e2c28e8e33b6a933962fa4710c0b diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt index a953662bf..44925f1db 100644 --- a/libomptarget/CMakeLists.txt +++ b/libomptarget/CMakeLists.txt @@ -1,9 +1,9 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build offloading library and related plugins. @@ -17,11 +17,12 @@ endif() # Add cmake directory to search for custom cmake functions. set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) -if(OPENMP_STANDALONE_BUILD) - # Build all libraries into a common place so that tests can find them. - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -endif() +# Set the path of all resulting libraries to a unified location so that it can +# be used for testing. +set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) # Message utilities. include(LibomptargetUtils) @@ -29,6 +30,11 @@ include(LibomptargetUtils) # Get dependencies for the different components of the project. include(LibomptargetGetDependencies) +# LLVM source tree is required at build time for libomptarget +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + message(FATAL_ERROR "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") +endif() + # This is a list of all the targets that are supported/tested right now. set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") @@ -38,7 +44,8 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") # Once the plugins for the different targets are validated, they will be added to # the list of supported targets in the current system. -set (LIBOMPTARGET_SYSTEM_TARGETS "") +set (LIBBOLTTARGET_SYSTEM_TARGETS "") +set (LIBBOLTTARGET_TESTED_PLUGINS "") # Check whether using debug mode. In debug mode, allow dumping progress # messages at runtime by default. 
Otherwise, it can be enabled @@ -53,17 +60,12 @@ if(LIBOMPTARGET_ENABLE_DEBUG) add_definitions(-DOMPTARGET_DEBUG) endif() -include_directories(include) +set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${LIBOMPTARGET_INCLUDE_DIR}) # Build target agnostic offloading library. -add_subdirectory(src) - -# Retrieve the path to the resulting library so that it can be used for -# testing. -get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) -if(NOT LIBOMPTARGET_LIBRARY_DIR) - set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -endif() +set(LIBOMPTARGET_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) +add_subdirectory(${LIBOMPTARGET_SRC_DIR}) # Definitions for testing, for reuse when testing libomptarget-nvptx. if(OPENMP_STANDALONE_BUILD) diff --git a/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake index dbf8c381d..f6bfadcaf 100644 --- a/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -15,6 +15,7 @@ # libffi : required to launch target kernels given function and argument # pointers. # CUDA : required to control offloading to NVIDIA GPUs. +# VEOS : required to control offloading to NEC Aurora. include (FindPackageHandleStandardArgs) @@ -116,6 +117,18 @@ if (CUDA_TOOLKIT_ROOT_DIR) endif() find_package(CUDA QUIET) +# Try to get the highest Nvidia GPU architecture the system supports +if (CUDA_FOUND) + cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS) + string(REGEX MATCH "sm_([0-9]+)" CUDA_ARCH_MATCH_OUTPUT ${CUDA_ARCH_FLAGS}) + if (NOT DEFINED CUDA_ARCH_MATCH_OUTPUT OR "${CMAKE_MATCH_1}" LESS 35) + libomptarget_warning_say("Setting Nvidia GPU architecture support for OpenMP target runtime library to sm_35 by default") + set(LIBOMPTARGET_DEP_CUDA_ARCH "35") + else() + set(LIBOMPTARGET_DEP_CUDA_ARCH "${CMAKE_MATCH_1}") + endif() +endif() + set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) @@ -136,17 +149,8 @@ find_library ( # There is a libcuda.so in lib64/stubs that can be used for linking. if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) - # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this - # case CUDA_LIBRARIES contains additional linker arguments which breaks - # get_filename_component below. Fortunately, since that change the module - # exports CUDA_cudart_static_LIBRARY which points to a single file in the - # right directory. - set(cuda_library ${CUDA_LIBRARIES}) - if (DEFINED CUDA_cudart_static_LIBRARY) - set(cuda_library ${CUDA_cudart_static_LIBRARY}) - endif() - get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) - find_library ( + get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY) + find_library( LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES NAMES cuda @@ -162,6 +166,61 @@ find_package_handle_standard_args( mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) ################################################################################ +# Looking for VEO... 
+################################################################################ + +find_path ( + LIBOMPTARGET_DEP_VEO_INCLUDE_DIR + NAMES + ve_offload.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + /opt/nec/ve/veos/include + ENV CPATH + PATH_SUFFIXES + libveo) + +find_library ( + LIBOMPTARGET_DEP_VEO_LIBRARIES + NAMES + veo + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + /opt/nec/ve/veos/lib64 + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +find_library( + LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES + NAMES + veosinfo + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + /opt/nec/ve/veos/lib64 + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_VEO + DEFAULT_MSG + LIBOMPTARGET_DEP_VEO_LIBRARIES + LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES + LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_VEO_FOUND + LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS) + # Looking for CUDA libdevice subdirectory # # Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work diff --git a/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake index 538bf9277..6ec0cc2b6 100644 --- a/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -78,7 +78,7 @@ endfunction() # These flags are required to emit LLVM Bitcode. We check them together because # if any of them are not supported, there is no point in finding out which are. -set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++11 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) diff --git a/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/libomptarget/cmake/Modules/LibomptargetUtils.cmake index 7339cc0b5..2836c53c0 100644 --- a/libomptarget/cmake/Modules/LibomptargetUtils.cmake +++ b/libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -11,17 +11,17 @@ # void libomptarget_say(string message_to_user); # - prints out message_to_user macro(libomptarget_say message_to_user) - message(STATUS "LIBOMPTARGET: ${message_to_user}") + message(STATUS "BOLT-LIBOMPTARGET: ${message_to_user}") endmacro() # void libomptarget_warning_say(string message_to_user); # - prints out message_to_user with a warning macro(libomptarget_warning_say message_to_user) - message(WARNING "LIBOMPTARGET: ${message_to_user}") + message(WARNING "BOLT-LIBOMPTARGET: ${message_to_user}") endmacro() # void libomptarget_error_say(string message_to_user); # - prints out message_to_user with an error and exits cmake macro(libomptarget_error_say message_to_user) - message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") + message(FATAL_ERROR "BOLT-LIBOMPTARGET: ${message_to_user}") endmacro() diff --git a/libomptarget/deviceRTLs/CMakeLists.txt b/libomptarget/deviceRTLs/CMakeLists.txt index 9723fb8cd..3df94eac0 100644 --- a/libomptarget/deviceRTLs/CMakeLists.txt +++ b/libomptarget/deviceRTLs/CMakeLists.txt @@ -6,7 +6,7 @@ # # ##===----------------------------------------------------------------------===## # -# 
Build a device RTL for each available machine available. +# Build a device RTL for each available machine. # ##===----------------------------------------------------------------------===## diff --git a/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt new file mode 100644 index 000000000..a9f81c37e --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -0,0 +1,152 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the AMDGCN Device RTL if the ROCM tools are available +# +##===----------------------------------------------------------------------===## + +find_package(LLVM QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH) + +if (LLVM_DIR) + libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") +else() + libomptarget_say("Not building AMDGCN device RTL: AOMP not found") + return() +endif() + +set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) + +if (AOMP_INSTALL_PREFIX) + set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) +else() + set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) +endif() + +libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") + +project(bolt-omptarget-amdgcn) + +add_custom_target(bolt-omptarget-amdgcn ALL) + +#optimization level +set(optimization_level 2) + +# Activate RTL message dumps if requested by the user. 
+if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) + +set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip + ${devicertl_base_directory}/common/src/cancel.cu + ${devicertl_base_directory}/common/src/critical.cu + ${devicertl_base_directory}/common/src/data_sharing.cu + ${devicertl_base_directory}/common/src/libcall.cu + ${devicertl_base_directory}/common/src/loop.cu + ${devicertl_base_directory}/common/src/omp_data.cu + ${devicertl_base_directory}/common/src/omptarget.cu + ${devicertl_base_directory}/common/src/parallel.cu + ${devicertl_base_directory}/common/src/reduction.cu + ${devicertl_base_directory}/common/src/support.cu + ${devicertl_base_directory}/common/src/sync.cu + ${devicertl_base_directory}/common/src/task.cu) + +set(h_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h + ${devicertl_base_directory}/common/debug.h + ${devicertl_base_directory}/common/device_environment.h + ${devicertl_base_directory}/common/omptarget.h + ${devicertl_base_directory}/common/omptargeti.h + ${devicertl_base_directory}/common/state-queue.h + ${devicertl_base_directory}/common/state-queuei.h + ${devicertl_base_directory}/common/support.h) + +# for both in-tree and out-of-tree build +if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) + set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +else() + set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) +endif() + +# create libraries +set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) +if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) + set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) +endif() + +macro(add_cuda_bc_library) + set(cu_cmd ${AOMP_BINDIR}/clang++ + -std=c++14 + -fcuda-rdc + -fvisibility=default + --cuda-device-only + -Wno-unused-value + -x hip + -nogpulib -nogpuinc + -O${optimization_level} + --cuda-gpu-arch=${mcpu} + ${CUDA_DEBUG} + -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${devicertl_base_directory}) + + set(bc1_files) + + foreach(file ${ARGN}) + get_filename_component(fname ${file} NAME_WE) + set(bc1_filename ${fname}.${mcpu}.bc) + + add_custom_command( + OUTPUT ${bc1_filename} + COMMAND ${cu_cmd} ${file} -o ${bc1_filename} + DEPENDS ${file} ${h_files}) + + list(APPEND bc1_files ${bc1_filename}) + endforeach() + + add_custom_command( + OUTPUT linkout.cuda.${mcpu}.bc + COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc + DEPENDS ${bc1_files}) + + list(APPEND bc_files linkout.cuda.${mcpu}.bc) +endmacro() + +set(libname "bolt-omptarget-amdgcn") + +foreach(mcpu ${mcpus}) + set(bc_files) + add_cuda_bc_library(${cuda_sources}) + + set(bc_libname lib${libname}-${mcpu}.bc) + add_custom_command( + OUTPUT ${bc_libname} + COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} + DEPENDS ${bc_files}) + + add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) + + install(FILES ${OUTPUTDIR}/${bc_libname} + DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" + ) +endforeach() diff --git a/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h new file mode 100644 index 000000000..80409d611 --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h @@ -0,0 +1,20 @@ +//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// +// +// 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGCN_INTERFACE_H_ +#define _AMDGCN_INTERFACE_H_ + +#include + +#define EXTERN extern "C" __attribute__((device)) +typedef uint64_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ + +EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads(); + +#endif diff --git a/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip new file mode 100644 index 000000000..f537fb283 --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -0,0 +1,31 @@ +//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock +// cannot be implemented - if one thread gets the lock, it can't continue on to +// the next instruction in order to do anything as the other threads are waiting +// to take the lock. +// These functions will be implemented to provide the documented semantics for +// a SIMD => wavefront mapping once that is implemented. +// +//===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "common/debug.h" + +static DEVICE void warn() { + PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); +} + +DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip new file mode 100644 index 000000000..c85045570 --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -0,0 +1,64 @@ +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "target_impl.h" + +// Partially derived fom hcc_detail/device_functions.h + +// HW_ID Register bit structure +// WAVE_ID 3:0 Wave buffer slot number. 0-9. +// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. +// PIPE_ID 7:6 Pipeline from which the wave was dispatched. +// CU_ID 11:8 Compute Unit the wave is assigned to. +// SH_ID 12 Shader Array (within an SE) the wave is assigned to. +// SE_ID 14:13 Shader Engine the wave is assigned to. +// TG_ID 19:16 Thread-group ID +// VM_ID 23:20 Virtual Memory ID +// QUEUE_ID 26:24 Queue from which this wave was dispatched. +// STATE_ID 29:27 State ID (graphics only, not compute). 
+// ME_ID 31:30 Micro-engine ID. + +enum { + HW_ID = 4, // specify that the hardware register to read is HW_ID + + HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits + HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register + + HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits + HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register +}; + +// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit +// immediate and returns a 32 bit value. +// The encoding of the immediate parameter is: +// ID 5:0 Which register to read from +// OFFSET 10:6 Range: 0..31 +// WIDTH 15:11 Range: 1..32 + +// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) +// where hwreg forms a 16 bit immediate encoded by the assembler thus: +// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { +// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); +// } +#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) + +// Note: The results can be changed by a context switch +// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper +// bound on how many compute units are available. Some values in this +// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t cu_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + uint32_t se_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; +} + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h new file mode 100644 index 000000000..6e8a651bd --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -0,0 +1,171 @@ +//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations and definitions of target specific functions and constants +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H +#define OMPTARGET_AMDGCN_TARGET_IMPL_H + +#ifndef __AMDGCN__ +#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" +#endif + +#include "amdgcn_interface.h" + +#include +#include +#include +#include + +#define DEVICE __attribute__((device)) +#define INLINE inline DEVICE +#define NOINLINE __attribute__((noinline)) DEVICE +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 64 + +// Maximum number of preallocated arguments to an outlined parallel/simd +// function. Anything more requires dynamic memory allocation. 
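An aside on the ENCODE_HWREG macro introduced in amdgcn_smid.hip above: the 16-bit s_getreg_b32 immediate packs the register id into bits 5:0, the bit offset into bits 10:6, and the field width minus one into bits 15:11. A minimal host-side sketch of that packing, usable as a compile-time sanity check (illustrative only, not part of the patch; encode_hwreg is a hypothetical stand-in for the macro):

#include <cstdint>

// Same packing as ENCODE_HWREG(WIDTH, OFF, REG) in amdgcn_smid.hip.
constexpr uint32_t encode_hwreg(uint32_t width, uint32_t off, uint32_t reg) {
  return reg | (off << 6) | ((width - 1) << 11);
}

// CU_ID: register 4, offset 8, width 4 -> 4 | 0x200 | 0x1800 = 0x1A04.
static_assert(encode_hwreg(4, 8, 4) == 0x1A04, "CU_ID immediate");
// SE_ID: register 4, offset 13, width 2 -> 4 | 0x340 | 0x800 = 0x0B44.
static_assert(encode_hwreg(2, 13, 4) == 0x0B44, "SE_ID immediate");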
+#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#define OMP_STATE_COUNT 32 +#define MAX_SM 64 + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 960, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 16, +}; + +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; + +INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); + hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); +} + +INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + return (((uint64_t)hi) << 32) | (uint64_t)lo; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); +DEVICE uint32_t __kmpc_impl_smid(); +DEVICE double __kmpc_impl_get_wtick(); +DEVICE double __kmpc_impl_get_wtime(); + +INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } +INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, + int32_t SrcLane); + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, + uint32_t Delta, int32_t Width); + +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { + // AMDGCN doesn't need to sync threads in a warp +} + +// AMDGCN specific kernel initialization +DEVICE void __kmpc_impl_target_init(); + +// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive. 
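As an aside on the constants declared above, the AMDGCN numbers are internally consistent in a way worth spelling out. A small compile-time check (illustrative only, not part of the patch; the k* names are made up, and the "reserved wavefront" reading of 960 is an assumption):

#include <cstdint>

constexpr int kWarpSize = 64;            // WARPSIZE (wavefront size)
constexpr int kMaxThreadsPerTeam = 1024; // MAX_THREADS_PER_TEAM
constexpr int kSlotSize = 256;           // DS_Slot_Size
constexpr int kMaxWarpNumber = 16;       // DS_Max_Warp_Number

// A full team is 1024 / 64 = 16 wavefronts, one data-sharing slot each.
static_assert(kMaxThreadsPerTeam / kWarpSize == kMaxWarpNumber, "");
// Each wavefront's slot is 64 * 256 bytes = 16 KiB (DS_Worker_Warp_Slot_Size).
static_assert(kWarpSize * kSlotSize == 16 * 1024, "");
// DS_Max_Worker_Threads (960) is a full team minus one wavefront, presumably
// the wavefront that hosts the team master in generic mode.
static_assert(kMaxThreadsPerTeam - kWarpSize == 960, "");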
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads); + +INLINE void __kmpc_impl_threadfence() { + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); +} + +INLINE void __kmpc_impl_threadfence_block() { + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); +} + +INLINE void __kmpc_impl_threadfence_system() { + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); +} + +// Calls to the AMDGCN layer (assuming 1D layout) +INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } +INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } +DEVICE int GetNumberOfBlocksInKernel(); +DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); + +// Atomics +template INLINE T __kmpc_atomic_add(T *address, T val) { + return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); +} + +INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) { + return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, ""); +} + +template INLINE T __kmpc_atomic_max(T *address, T val) { + return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); +} + +template INLINE T __kmpc_atomic_exchange(T *address, T val) { + T r; + __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); + return r; +} + +template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { + (void)__atomic_compare_exchange(address, &compare, &val, false, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return compare; +} + +// Locks +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +DEVICE void *__kmpc_impl_malloc(size_t x); +DEVICE void __kmpc_impl_free(void *x); + +// DEVICE versions of part of libc +INLINE void __assert_fail(const char *, const char *, unsigned int, + const char *) { + __builtin_trap(); +} +EXTERN int printf(const char *, ...); + +#endif diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip new file mode 100644 index 000000000..7388a2921 --- /dev/null +++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -0,0 +1,156 @@ +//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "target_impl.h" + +// Implementations initially derived from hcc + +// Initialized with a 64-bit mask with bits set in positions less than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + uint32_t lane = GetLaneId(); + int64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + return mask & ballot; +} + +// Initialized with a 64-bit mask with bits set in positions greater than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + uint32_t lane = GetLaneId(); + if (lane == (WARPSIZE - 1)) + return 0; + uint64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = (~((uint64_t)0)) << (lane + 1); + return mask & ballot; +} + +DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } + +DEVICE double __kmpc_impl_get_wtime() { + // The intrinsics for measuring time have undocumented frequency + // This will probably need to be found by measurement on a number of + // architectures. Until then, return 0, which is very inaccurate as a + // timer but resolves the undefined symbol at link time. + return 0; +} + +// Warp vote function +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { + return __builtin_amdgcn_read_exec(); +} + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, + int32_t srcLane) { + int width = WARPSIZE; + int self = GetLaneId(); + int index = srcLane + (self & ~(width - 1)); + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, + uint32_t laneDelta, int32_t width) { + int self = GetLaneId(); + int index = self + laneDelta; + index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +static DEVICE SHARED uint32_t L1_Barrier; + +DEVICE void __kmpc_impl_target_init() { + // Don't have global ctors, and shared memory is not zero init + __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE); +} + +DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { + __atomic_thread_fence(__ATOMIC_ACQUIRE); + + uint32_t num_waves = num_threads / WARPSIZE; + + // Partial barrier implementation for amdgcn. + // Uses two 16 bit unsigned counters. One for the number of waves to have + // reached the barrier, and one to count how many times the barrier has been + // passed. These are packed in a single atomically accessed 32 bit integer. + // Low bits for the number of waves, assumed zero before this call. + // High bits to count the number of times the barrier has been passed. + + assert(num_waves != 0); + assert(num_waves * WARPSIZE == num_threads); + assert(num_waves < 0xffffu); + + // Increment the low 16 bits once, using the lowest active thread. 
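// (Editorial aside, not part of the patch.) Concretely, for a team of, say,
// four wavefronts: the first three arriving leaders observe old values
// 0x00000000, 0x00000001 and 0x00000002 from the fetch-add; the fourth
// observes 0x00000003 (== num_waves - 1), clears the low half and stores
// 0x00010000. That change of the high 16 bits (the "generation") is what
// releases the waves spinning in the else-branch below, and wrap-around of
// the generation is harmless because only inequality with the sampled
// generation is tested.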
+ uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1; + bool isLowest = GetLaneId() == lowestActiveThread; + + if (isLowest) { + uint32_t load = + __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative + + // Record the number of times the barrier has been passed + uint32_t generation = load & 0xffff0000u; + + if ((load & 0x0000ffffu) == (num_waves - 1)) { + // Reached num_waves in low bits so this is the last wave. + // Set low bits to zero and increment high bits + load += 0x00010000u; // wrap is safe + load &= 0xffff0000u; // because bits zeroed second + + // Reset the wave counter and release the waiting waves + __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED); + } else { + // more waves still to go, spin until generation counter changes + do { + __builtin_amdgcn_s_sleep(0); + load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED); + } while ((load & 0xffff0000u) == generation); + } + } + __atomic_thread_fence(__ATOMIC_RELEASE); +} + +namespace { +DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) { + uint32_t q = n / d; + return q + (n > q * d); +} +DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, + uint16_t group_size) { + uint32_t r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; +} +} // namespace + +DEVICE int GetNumberOfBlocksInKernel() { + return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); +} + +DEVICE int GetNumberOfThreadsInBlock() { + return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} + +EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { + return GetNumberOfThreadsInBlock(); +} + +// Stub implementations +DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; } +DEVICE void __kmpc_impl_free(void *) {} + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/debug.h b/libomptarget/deviceRTLs/common/debug.h similarity index 95% rename from libomptarget/deviceRTLs/nvptx/src/debug.h rename to libomptarget/deviceRTLs/common/debug.h index f2fcc1d8f..6539b7ad7 100644 --- a/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/libomptarget/deviceRTLs/common/debug.h @@ -28,6 +28,8 @@ #ifndef _OMPTARGET_NVPTX_DEBUG_H_ #define _OMPTARGET_NVPTX_DEBUG_H_ +#include "common/device_environment.h" + //////////////////////////////////////////////////////////////////////////////// // set desired level of debugging //////////////////////////////////////////////////////////////////////////////// @@ -121,30 +123,27 @@ #endif //////////////////////////////////////////////////////////////////////////////// -// implemtation for debug +// implementation for debug //////////////////////////////////////////////////////////////////////////////// #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING -#include -#include "option.h" +#include "common/support.h" template NOINLINE static void log(const char *fmt, Arguments... 
parameters) { - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), - (int)(threadIdx.x & 0x1F), parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); } #endif #if OMPTARGET_NVPTX_TEST -#include template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, - (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), - parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); assert(cond); } @@ -195,13 +194,13 @@ NOINLINE static void check(bool cond) { assert(cond); } } #else -#define DON(_flag) (FALSE) +#define DON(_flag) (0) #define PRINT0(flag, str) #define PRINT(flag, str, _args...) #endif -// for printing without worring about precision, pointers... +// for printing without worrying about precision, pointers... #define P64(_x) ((unsigned long long)(_x)) //////////////////////////////////////////////////////////////////////////////// @@ -247,7 +246,7 @@ NOINLINE static void check(bool cond) { assert(cond); } #else -#define TON(_flag) (FALSE) +#define TON(_flag) (0) #define ASSERT0(_flag, _cond, _str) #define ASSERT(_flag, _cond, _str, _args...) @@ -279,7 +278,7 @@ NOINLINE static void check(bool cond) { assert(cond); } #else -#define WON(_flag) (FALSE) +#define WON(_flag) (0) #define WARNING0(_flag, _str) #define WARNING(_flag, _str, _args...) diff --git a/libomptarget/deviceRTLs/common/device_environment.h b/libomptarget/deviceRTLs/common/device_environment.h new file mode 100644 index 000000000..68a7757d2 --- /dev/null +++ b/libomptarget/deviceRTLs/common/device_environment.h @@ -0,0 +1,24 @@ +//===---- device_environment.h - OpenMP GPU device environment --- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Global device environment +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ +#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ + +#include "target_impl.h" + +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +extern DEVICE omptarget_device_environmentTy omptarget_device_environment; + +#endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/common/omptarget.h similarity index 69% rename from libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h rename to libomptarget/deviceRTLs/common/omptarget.h index 70e6c286a..fc4eb6bfb 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/common/omptarget.h @@ -1,4 +1,4 @@ -//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,25 +11,14 @@ // //===----------------------------------------------------------------------===// -#ifndef __OMPTARGET_NVPTX_H -#define __OMPTARGET_NVPTX_H +#ifndef OMPTARGET_H +#define OMPTARGET_H -// std includes -#include -#include - -#include - -// cuda includes -#include -#include - -// local includes -#include "debug.h" // debug +#include "target_impl.h" +#include "common/debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user -#include "option.h" // choices we have -#include "state-queue.h" -#include "support.h" +#include "common/state-queue.h" +#include "common/support.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -57,17 +46,16 @@ class omptarget_nvptx_SharedArgs { // Free any memory allocated for outlined parallel function with a large // number of arguments. if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, (char *)"new extended args"); + SafeFree(args, "new extended args"); Init(); } } INLINE void EnsureSize(size_t size) { if (size > nArgs) { if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, (char *)"new extended args"); + SafeFree(args, "new extended args"); } - args = (void **) SafeMalloc(size * sizeof(void *), - (char *)"new extended args"); + args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); nArgs = size; } } @@ -83,21 +71,17 @@ class omptarget_nvptx_SharedArgs { uint32_t nArgs; }; -extern __device__ __shared__ omptarget_nvptx_SharedArgs +extern DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, - // The size of the preallocated shared memory buffer per team - DS_Shared_Memory_Size = 128, +// Worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; }; // Data structure to keep in shared memory that traces the current slot, stack, @@ -109,25 +93,8 @@ struct DataSharingStateTy { void * volatile FramePtr[DS_Max_Warp_Number]; __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -// Additional worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; -// Additional master slot type which is initialized with the default master slot -// size of 4 bytes. 
-struct __kmpc_data_sharing_master_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Slot_Size]; -}; -extern __device__ __shared__ DataSharingStateTy DataSharingState; + +extern DEVICE SHARED DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -226,42 +193,10 @@ class omptarget_nvptx_TeamDescr { INLINE omptarget_nvptx_WorkDescr &WorkDescr() { return workDescrForActiveParallel; } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } // init INLINE void InitTeamDescr(); - INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { - // If this is invoked by the master thread of the master warp then intialize - // it with a smaller slot. - if (IsMasterThread) { - // Do not initalize this slot again if it has already been initalized. - if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the - // data section. DataEnd is non-inclusive. - master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; - // We currently do not have a next slot. - master_rootS[0].Next = 0; - master_rootS[0].Prev = 0; - master_rootS[0].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&master_rootS[0]; - } - // Do not initalize this slot again if it has already been initalized. - if (worker_rootS[wid].DataEnd == - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the data - // section. DataEnd is non-inclusive. - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. 
- worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { worker_rootS[wid].DataEnd = &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; @@ -277,11 +212,9 @@ class omptarget_nvptx_TeamDescr { levelZeroTaskDescr; // icv for team master initial thread omptarget_nvptx_WorkDescr workDescrForActiveParallel; // one, ONLY for the active par - uint64_t lastprivateIterBuffer; - __align__(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; - __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; + ALIGN(16) + __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number]; }; //////////////////////////////////////////////////////////////////////////////// @@ -303,10 +236,6 @@ class omptarget_nvptx_ThreadPrivateContext { INLINE uint16_t &NumThreadsForNextParallel(int tid) { return nextRegion.tnum[tid]; } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } // schedule (for dispatch) INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } @@ -322,7 +251,7 @@ class omptarget_nvptx_ThreadPrivateContext { private: // team context for this team omptarget_nvptx_TeamDescr teamContext; - // task ICV for implict threads in the only parallel region + // task ICV for implicit threads in the only parallel region omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; // pointer where to find the current task ICV (top of the stack) omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; @@ -330,8 +259,6 @@ class omptarget_nvptx_ThreadPrivateContext { // Only one of the two is live at the same time. // parallel uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; } nextRegion; // schedule (for dispatch) kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for @@ -343,15 +270,10 @@ class omptarget_nvptx_ThreadPrivateContext { uint64_t cnt; }; -/// Device envrionment data -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - /// Memory manager for statically allocated memory. 
class omptarget_nvptx_SimpleMemoryManager { private: - __align__(128) struct MemDataTy { + ALIGN(128) struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; } MemData[MAX_SM]; @@ -364,32 +286,26 @@ class omptarget_nvptx_SimpleMemoryManager { INLINE const void *Acquire(const void *buf, size_t size); }; -//////////////////////////////////////////////////////////////////////////////// -// global device envrionment -//////////////////////////////////////////////////////////////////////////////// - -extern __device__ omptarget_device_environmentTy omptarget_device_environment; - //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // global data tables //////////////////////////////////////////////////////////////////////////////// -extern __device__ omptarget_nvptx_SimpleMemoryManager +extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern __device__ __shared__ uint32_t usedMemIdx; -extern __device__ __shared__ uint32_t usedSlotIdx; -extern __device__ __shared__ uint8_t +extern DEVICE SHARED uint32_t usedMemIdx; +extern DEVICE SHARED uint32_t usedSlotIdx; +extern DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern __device__ __shared__ uint16_t threadLimit; -extern __device__ __shared__ uint16_t threadsInTeam; -extern __device__ __shared__ uint16_t nThreads; -extern __device__ __shared__ +extern DEVICE SHARED uint16_t threadLimit; +extern DEVICE SHARED uint16_t threadsInTeam; +extern DEVICE SHARED uint16_t nThreads; +extern DEVICE SHARED omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; -extern __device__ __shared__ uint32_t execution_param; -extern __device__ __shared__ void *ReductionScratchpadPtr; +extern DEVICE SHARED uint32_t execution_param; +extern DEVICE SHARED void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -397,7 +313,7 @@ extern __device__ __shared__ void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile __device__ __shared__ omptarget_nvptx_WorkFn +extern volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// @@ -414,7 +330,6 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); // inlined implementation //////////////////////////////////////////////////////////////////////////////// -#include "omptarget-nvptxi.h" -#include "supporti.h" +#include "common/omptargeti.h" #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/common/omptargeti.h similarity index 95% rename from libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h rename to libomptarget/deviceRTLs/common/omptargeti.h index e4efa1808..108724e0b 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ b/libomptarget/deviceRTLs/common/omptargeti.h @@ -1,4 +1,4 @@ -//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -40,7 +40,7 @@ omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { items.flags = 0; items.threadId = 0; // is master - items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 } // This is called when all threads are started together in SPMD mode. @@ -57,7 +57,7 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) - items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 prev = parentTaskDescr; } @@ -88,7 +88,7 @@ INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( omptarget_nvptx_TaskDescr *masterTaskDescr) { CopyParent(masterTaskDescr); - // overrwrite specific items; + // overwrite specific items; items.flags |= TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel } @@ -97,7 +97,7 @@ INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( omptarget_nvptx_TaskDescr *workTaskDescr) { Copy(workTaskDescr); // - // overrwrite specific items; + // overwrite specific items; // // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). // This is so that the serial master (first lane in the master warp) @@ -207,7 +207,7 @@ INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, "MemIdx is too big or uninitialized."); MemDataTy &MD = MemData[usedSlotIdx]; - atomicExch((unsigned *)&MD.keys[usedMemIdx], 0); + __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u); } INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, @@ -217,7 +217,7 @@ INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, const unsigned sm = usedSlotIdx; MemDataTy &MD = MemData[sm]; unsigned i = hash(GetBlockIdInKernel()); - while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) { + while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) { i = hash(i + 1); } usedSlotIdx = sm; diff --git a/libomptarget/deviceRTLs/nvptx/src/cancel.cu b/libomptarget/deviceRTLs/common/src/cancel.cu similarity index 87% rename from libomptarget/deviceRTLs/nvptx/src/cancel.cu rename to libomptarget/deviceRTLs/common/src/cancel.cu index 93fc5daa9..0e5cd2b60 100644 --- a/libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ b/libomptarget/deviceRTLs/common/src/cancel.cu @@ -9,19 +9,23 @@ // Interface to be used in the implementation of OpenMP cancel. 
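Stepping back to the omptarget_nvptx_SimpleMemoryManager hunks above: Acquire claims a per-team state slot by linear probing a key array with an atomic compare-and-swap, and Release swaps the key back to zero. A host-side analogue of that pattern (illustrative only, not part of the patch; std::atomic stands in for the __kmpc_atomic_* wrappers, and the names and toy hash are made up):

#include <atomic>
#include <cstddef>

constexpr std::size_t kKeys = 32;      // plays the role of OMP_STATE_COUNT
std::atomic<unsigned> keys[kKeys];     // static storage: zero-initialized, all free

std::size_t hash(std::size_t v) { return v % kKeys; }

// Claim the first free key at or after hash(start), probing linearly.
std::size_t acquire(std::size_t start) {
  std::size_t i = hash(start);
  unsigned expected = 0;
  while (!keys[i].compare_exchange_strong(expected, 1u)) {
    expected = 0;        // CAS overwrites expected on failure; reset and retry
    i = hash(i + 1);
  }
  return i;
}

void release(std::size_t i) { keys[i].store(0u); }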
// //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "interface.h" +#include "common/debug.h" EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); // disabled - return FALSE; + return 0; } EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); // disabled - return FALSE; + return 0; } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/critical.cu b/libomptarget/deviceRTLs/common/src/critical.cu similarity index 89% rename from libomptarget/deviceRTLs/nvptx/src/critical.cu rename to libomptarget/deviceRTLs/common/src/critical.cu index 2eb94f5cb..3fd89c50a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/critical.cu +++ b/libomptarget/deviceRTLs/common/src/critical.cu @@ -9,10 +9,10 @@ // This file contains the implementation of critical with KMPC interface // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include - -#include "omptarget-nvptx.h" +#include "interface.h" +#include "common/debug.h" EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, @@ -27,3 +27,5 @@ void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, PRINT0(LD_IO, "call to kmpc_end_critical()\n"); omp_unset_lock((omp_lock_t *)lck); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/common/src/data_sharing.cu b/libomptarget/deviceRTLs/common/src/data_sharing.cu new file mode 100644 index 000000000..577c50a8b --- /dev/null +++ b/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -0,0 +1,280 @@ +//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of data sharing environments +// +//===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "common/omptarget.h" +#include "target_impl.h" + +// Return true if this is the master thread. +INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { + return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Runtime functions for trunk data sharing scheme. +//////////////////////////////////////////////////////////////////////////////// + +INLINE static void data_sharing_init_stack_common() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + + for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { + __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). 
This function is called only by the MASTER thread of each +// team in non-SPMD mode. +EXTERN void __kmpc_data_sharing_init_stack() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + data_sharing_init_stack_common(); + omptarget_nvptx_globalArgs.Init(); +} + +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called in SPMD mode only. +EXTERN void __kmpc_data_sharing_init_stack_spmd() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + if (GetThreadIdInBlock() == 0) + data_sharing_init_stack_common(); + + __kmpc_impl_threadfence_block(); +} + +INLINE static void* data_sharing_push_stack_common(size_t PushSize) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + // Only warp active master threads manage the stack. + bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; + + // Add worst-case padding to DataSize so that future stack allocations are + // correctly aligned. + const size_t Alignment = 8; + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; + + // Frame pointer must be visible to all workers in the same warp. + const unsigned WID = GetWarpId(); + void *FrameP = 0; + __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); + + if (IsWarpMaster) { + // SlotP will point to either the shared memory slot or an existing + // global memory slot. + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + + // Check if we have room for the data in the current slot. + const uintptr_t StartAddress = (uintptr_t)StackP; + const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; + + // If we requested more data than there is room for in the rest + // of the slot then we need to either re-use the next slot, if one exists, + // or create a new slot. + if (EndAddress < RequestedEndAddress) { + __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = PushSize; + + // Allocate at least the default size for each type of slot. + // Master is a special case and even though there is only one thread, + // it can share more things with the workers. For uniformity, it uses + // the full size of a worker warp slot. + size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; + NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Global memory slot allocation."); + + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; + // The current slot becomes the new slot. + SlotP = NewSlot; + // The stack pointer always points to the next free stack frame. + StackP = &NewSlot->Data[0] + PushSize; + // The frame pointer always points to the beginning of the frame. 
+ FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; + } else { + // Add the data chunk to the current slot. The frame pointer is set to + // point to the start of the new frame held in StackP. + FrameP = DataSharingState.FramePtr[WID] = StackP; + // Reset stack pointer to the requested address. + StackP = (void *)RequestedEndAddress; + } + } + // Get address from lane 0. + int *FP = (int *)&FrameP; + FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); + if (sizeof(FrameP) == 8) + FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); + + return FrameP; +} + +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + return data_sharing_push_stack_common(DataSize); +} + +// Called at the time of the kernel initialization. This is used to initilize +// the list of references to shared variables and to pre-allocate global storage +// for holding the globalized variables. +// +// By default the globalized variables are stored in global memory. If the +// UseSharedMemory is set to true, the runtime will attempt to use shared memory +// as long as the size requested fits the pre-allocated size. +EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) + ? DataSize + : WARPSIZE * DataSize; + + // Compute the start address of the frame of each thread in the warp. + uintptr_t FrameStartAddress = + (uintptr_t) data_sharing_push_stack_common(PushSize); + FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); + return (void *)FrameStartAddress; +} + +// Pop the stack and free any memory which can be reclaimed. +// +// When the pop operation removes the last global memory slot, +// reclaim all outstanding global memory slots since it is +// likely we have reached the end of the kernel. +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + __kmpc_impl_threadfence_block(); + + if (GetThreadIdInBlock() % WARPSIZE == 0) { + unsigned WID = GetWarpId(); + + // Current slot + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + + // Pointer to next available stack. + void *&StackP = DataSharingState.StackPtr[WID]; + + // Pop the frame. + StackP = FrameStart; + + // If the current slot is empty, we need to free the slot after the + // pop. + bool SlotEmpty = (StackP == &SlotP->Data[0]); + + if (SlotEmpty && SlotP->Prev) { + // Before removing the slot we need to reset StackP. + StackP = SlotP->PrevSlotStackPtr; + + // Remove the slot. + SlotP = SlotP->Prev; + SafeFree(SlotP->Next, "Free slot."); + SlotP->Next = 0; + } + } +} + +// Begin a data sharing context. Maintain a list of references to shared +// variables. This list of references to shared variables will be passed +// to one or more threads. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. 
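Backing up to __kmpc_data_sharing_push_stack above, the footprint arithmetic is worth one concrete example (illustrative only, not part of the patch; the 20-byte frame is hypothetical): a worker warp master requests one DataSize chunk per lane, the total is rounded up to 8-byte alignment, and each lane's frame then starts at lane * DataSize inside that allocation.

#include <cstdint>

constexpr uint64_t kWarpSize = 64;   // WARPSIZE on amdgcn
constexpr uint64_t kAlignment = 8;   // Alignment in data_sharing_push_stack_common
constexpr uint64_t kDataSize = 20;   // hypothetical per-thread frame size

constexpr uint64_t round_up(uint64_t v) {
  return (v + (kAlignment - 1)) / kAlignment * kAlignment;
}

// Whole-warp request: 64 * 20 = 1280 bytes, already 8-byte aligned.
static_assert(round_up(kWarpSize * kDataSize) == 1280, "");
// A lone master thread (or uninitialized runtime) pushes just one rounded
// frame: 20 bytes -> 24 bytes.
static_assert(round_up(kDataSize) == 24, "");
// Lane 5's frame begins at byte offset 5 * 20 = 100 within the warp allocation.
static_assert(5 * kDataSize == 100, "");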
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { + omptarget_nvptx_globalArgs.EnsureSize(nArgs); + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// End a data sharing context. There is no need to have a list of refs +// to shared variables because the context in which those variables were +// shared has now ended. This should clean-up the list of references only +// without affecting the actual global storage of the variables. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_end_sharing_variables() { + omptarget_nvptx_globalArgs.DeInit(); +} + +// This function will return a list of references to global variables. This +// is how the workers will get a reference to the globalized variable. The +// members of this list will be passed to the outlined parallel function +// preserving the order. +// Called by all workers. +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// This function is used to init static memory manager. This manager is used to +// manage statically allocated global memory. This memory is allocated by the +// compiler and used to correctly implement globalization of the variables in +// target, teams and distribute regions. +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, + const void **frame) { + if (is_shared) { + *frame = buf; + return; + } + if (isSPMDExecutionMode) { + if (GetThreadIdInBlock() == 0) { + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + } + __kmpc_impl_syncthreads(); + return; + } + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + __kmpc_impl_threadfence(); +} + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { + if (is_shared) + return; + if (isSPMDExecutionMode) { + __kmpc_impl_syncthreads(); + if (GetThreadIdInBlock() == 0) { + omptarget_nvptx_simpleMemoryManager.Release(); + } + return; + } + __kmpc_impl_threadfence(); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + omptarget_nvptx_simpleMemoryManager.Release(); +} + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/common/src/libcall.cu similarity index 81% rename from libomptarget/deviceRTLs/nvptx/src/libcall.cu rename to libomptarget/deviceRTLs/common/src/libcall.cu index 9580d7596..f43d74a0c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/libomptarget/deviceRTLs/common/src/libcall.cu @@ -1,4 +1,4 @@ -//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===// +//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -10,21 +10,19 @@ // invoked by the user in an OpenMP region // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" - -// Timer precision is 1ns -#define TIMER_PRECISION ((double)1E-9) +#include "common/omptarget.h" +#include "target_impl.h" EXTERN double omp_get_wtick(void) { - PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION); - return TIMER_PRECISION; + double rc = __kmpc_impl_get_wtick(); + PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); + return rc; } EXTERN double omp_get_wtime(void) { - unsigned long long nsecs; - asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); - double rc = (double)nsecs * TIMER_PRECISION; + double rc = __kmpc_impl_get_wtime(); PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); return rc; } @@ -92,7 +90,7 @@ EXTERN int omp_in_parallel(void) { EXTERN int omp_in_final(void) { // treat all tasks as final... Specs may expect runtime to keep // track more precisely if a task was actively set by users... This - // is not explicitely specified; will treat as if runtime can + // is not explicitly specified; will treat as if runtime can // actively decide to put a non-final task into a final one. int rc = 1; PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); @@ -286,7 +284,7 @@ EXTERN void omp_get_partition_place_nums(int *place_nums) { } EXTERN int omp_get_cancellation(void) { - int rc = FALSE; // currently false only + int rc = 0; PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); return rc; } @@ -339,102 +337,30 @@ EXTERN int omp_get_max_task_priority(void) { // locks //////////////////////////////////////////////////////////////////////////////// -#define __OMP_SPIN 1000 -#define UNSET 0 -#define SET 1 - EXTERN void omp_init_lock(omp_lock_t *lock) { - omp_unset_lock(lock); + __kmpc_impl_init_lock(lock); PRINT0(LD_IO, "call omp_init_lock()\n"); } EXTERN void omp_destroy_lock(omp_lock_t *lock) { - omp_unset_lock(lock); + __kmpc_impl_destroy_lock(lock); PRINT0(LD_IO, "call omp_destroy_lock()\n"); } EXTERN void omp_set_lock(omp_lock_t *lock) { - // int atomicCAS(int* address, int compare, int val); - // (old == compare ? val : old) - - // TODO: not sure spinning is a good idea here.. - while (atomicCAS(lock, UNSET, SET) != UNSET) { - clock_t start = clock(); - clock_t now; - for (;;) { - now = clock(); - clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * blockIdx.x) { - break; - } - } - } // wait for 0 to be the read value - + __kmpc_impl_set_lock(lock); PRINT0(LD_IO, "call omp_set_lock()\n"); } EXTERN void omp_unset_lock(omp_lock_t *lock) { - (void)atomicExch(lock, UNSET); - + __kmpc_impl_unset_lock(lock); PRINT0(LD_IO, "call omp_unset_lock()\n"); } EXTERN int omp_test_lock(omp_lock_t *lock) { - // int atomicCAS(int* address, int compare, int val); - // (old == compare ? 
val : old) - int ret = atomicAdd(lock, 0); - - PRINT(LD_IO, "call omp_test_lock() return %d\n", ret); - - return ret; -} - -// for xlf Fotran -// Fotran, the return is LOGICAL type - -#define FLOGICAL long -EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { - int ret = omp_is_initial_device(); - if (ret == 0) - return (FLOGICAL)0; - else - return (FLOGICAL)1; -} - -EXTERN int __xlf_omp_is_initial_device_i4() { - int ret = omp_is_initial_device(); - if (ret == 0) - return 0; - else - return 1; -} - -EXTERN long __xlf_omp_get_team_num_i4() { - int ret = omp_get_team_num(); - return (long)ret; -} - -EXTERN long __xlf_omp_get_num_teams_i4() { - int ret = omp_get_num_teams(); - return (long)ret; -} - -EXTERN void xlf_debug_print_int(int *p) { - printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_long(long *p) { - printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_float(float *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_double(double *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); + int rc = __kmpc_impl_test_lock(lock); + PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); + return rc; } -EXTERN void xlf_debug_print_addr(void *p) { - printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); -} +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/common/src/loop.cu similarity index 93% rename from libomptarget/deviceRTLs/nvptx/src/loop.cu rename to libomptarget/deviceRTLs/common/src/loop.cu index ee378460a..a3ace098f 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/common/src/loop.cu @@ -11,8 +11,9 @@ // interface as loops. // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" #include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// @@ -170,7 +171,7 @@ public: break; } default: { - ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); + ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", (int)schedtype); ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, @@ -220,7 +221,7 @@ public: * When it is we'll want to look at them somewhere here and use that * information to add to our schedule choice. We shouldn't need to pass * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In paritcular, whether or not a stealing scheme + * various dynamic cases. (In particular, whether or not a stealing scheme * is legal). 
*/ schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); @@ -363,7 +364,7 @@ public: __kmpc_barrier(loc, threadId); if (tid == 0) { omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __threadfence_block(); + __kmpc_impl_threadfence_block(); } __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, @@ -397,9 +398,9 @@ public: unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); uint64_t warp_res; if (rank == 0) { - warp_res = atomicAdd( + warp_res = __kmpc_atomic_add( (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), - change); + (unsigned long long)change); } warp_res = Shuffle(active, warp_res, leader); return warp_res + rank; @@ -754,54 +755,4 @@ EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } -namespace { -INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { - int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 worker - // is started, so we don't need a barrier. - if (NumThreads > 1) { -#endif - named_sync(L1_BARRIER, WARPSIZE * NumWarps); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } -#endif -} -}; // namespace - -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, - int32_t varNum, void *array) { - PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Expected non-SPMD mode + initialized runtime."); - - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); - uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); - for (unsigned i = 0; i < varNum; i++) { - // Reset buffer. - if (gtid == 0) - *Buffer = 0; // Reset to minimum loop iteration value. - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Atomic max of iterations. - uint64_t *varArray = (uint64_t *)array; - uint64_t elem = varArray[i]; - (void)atomicMax((unsigned long long int *)Buffer, - (unsigned long long int)elem); - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Read max value and update thread private array. - varArray[i] = *Buffer; - - // Barrier. - syncWorkersInGenericMode(NumThreads); - } -} +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/libomptarget/deviceRTLs/common/src/omp_data.cu similarity index 72% rename from libomptarget/deviceRTLs/nvptx/src/omp_data.cu rename to libomptarget/deviceRTLs/common/src/omp_data.cu index d369da1cb..b69affdc2 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -1,4 +1,4 @@ -//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===// +//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,34 +9,36 @@ // This file contains the data objects used on the GPU device. 
// //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" +#include "common/device_environment.h" //////////////////////////////////////////////////////////////////////////////// -// global device envrionment +// global device environment //////////////////////////////////////////////////////////////////////////////// -__device__ omptarget_device_environmentTy omptarget_device_environment; +DEVICE omptarget_device_environmentTy omptarget_device_environment; //////////////////////////////////////////////////////////////////////////////// // global data holding OpenMP state information //////////////////////////////////////////////////////////////////////////////// -__device__ +DEVICE omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; -__device__ omptarget_nvptx_SimpleMemoryManager +DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -__device__ __shared__ uint32_t usedMemIdx; -__device__ __shared__ uint32_t usedSlotIdx; +DEVICE SHARED uint32_t usedMemIdx; +DEVICE SHARED uint32_t usedSlotIdx; -__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -__device__ __shared__ uint16_t threadLimit; -__device__ __shared__ uint16_t threadsInTeam; -__device__ __shared__ uint16_t nThreads; +DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +DEVICE SHARED uint16_t threadLimit; +DEVICE SHARED uint16_t threadsInTeam; +DEVICE SHARED uint16_t nThreads; // Pointer to this team's OpenMP state object -__device__ __shared__ +DEVICE SHARED omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; //////////////////////////////////////////////////////////////////////////////// @@ -44,24 +46,26 @@ __device__ __shared__ // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ uint32_t execution_param; +DEVICE SHARED uint32_t execution_param; //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ DataSharingStateTy DataSharingState; +DEVICE SHARED DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ void *ReductionScratchpadPtr; +DEVICE SHARED void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. 
//////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/common/src/omptarget.cu similarity index 84% rename from libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu rename to libomptarget/deviceRTLs/common/src/omptarget.cu index c84c05544..82d1db1dc 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -1,4 +1,4 @@ -//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,15 +9,16 @@ // This file contains the initialization code for the GPU // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" #include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // global data tables //////////////////////////////////////////////////////////////////////////////// -extern __device__ +extern DEVICE omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; @@ -25,19 +26,6 @@ extern __device__ // init entry points //////////////////////////////////////////////////////////////////////////////// -INLINE static unsigned smid() { - unsigned id; - asm("mov.u32 %0, %%smid;" : "=r"(id)); - return id; -} - -EXTERN void __kmpc_kernel_init_params(void *Ptr) { - PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", - OMPTARGET_NVPTX_VERSION); - - SetTeamsReductionScratchpadPtr(Ptr); -} - EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); @@ -53,7 +41,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); // Get a state object from the queue. - int slot = smid() % MAX_SM; + int slot = __kmpc_impl_smid() % MAX_SM; usedSlotIdx = slot; omptarget_nvptx_threadPrivateContext = omptarget_nvptx_device_State[slot].Dequeue(); @@ -74,8 +62,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { // set number of threads and thread limit in team to started value omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfWorkersInTeam(); + nThreads = GetNumberOfThreadsInBlock(); threadLimit = ThreadLimit; + __kmpc_impl_target_init(); } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -90,15 +79,14 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { omptarget_nvptx_workFn = 0; } -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing) { +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); setExecutionParameters(Spmd, RequiresOMPRuntime ? 
RuntimeInitialized : RuntimeUninitialized); int threadId = GetThreadIdInBlock(); if (threadId == 0) { - usedSlotIdx = smid() % MAX_SM; + usedSlotIdx = __kmpc_impl_smid() % MAX_SM; parallelLevel[0] = 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } else if (GetLaneId() == 0) { @@ -147,19 +135,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, "thread will execute parallel region with id %d in a team of " "%d threads\n", (int)newTaskDescr->ThreadId(), (int)ThreadLimit); - - if (RequiresDataSharing && GetLaneId() == 0) { - // Warp master innitializes data sharing environment. - unsigned WID = threadId / WARPSIZE; - __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( - WID, WID == WARPSIZE - 1); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { - __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); } EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { @@ -183,3 +158,5 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() { PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); return isSPMDMode(); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/common/src/parallel.cu similarity index 59% rename from libomptarget/deviceRTLs/nvptx/src/parallel.cu rename to libomptarget/deviceRTLs/common/src/parallel.cu index 016ded8a5..790730ffd 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/common/src/parallel.cu @@ -1,4 +1,4 @@ -//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Parallel implemention in the GPU. Here is the pattern: +// Parallel implementation in the GPU. Here is the pattern: // // while (not finished) { // @@ -31,165 +31,11 @@ // To make a long story short... 
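The inline smid() helper deleted above moves behind the target layer as __kmpc_impl_smid(). On NVPTX it presumably keeps the same PTX read of the SM id (the body is taken from the removed function; its new home in target_impl is an assumption):

    DEVICE unsigned __kmpc_impl_smid() {
      unsigned id;
      asm("mov.u32 %0, %%smid;" : "=r"(id));     // index of the SM running this block
      return id;
    }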
// //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" #include "target_impl.h" -typedef struct ConvergentSimdJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t slimForNextSimd; -} ConvergentSimdJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent simd (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - int32_t SimdLimit = - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); - job->slimForNextSimd = SimdLimit; - - int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); - // reset simdlimit to avoid propagating to successive #simd - if (SimdLimitSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; - - // We cannot have more than the # of convergent threads. - if (SimdLimitSource > 0) - *NumLanes = min(ConvergentSize, SimdLimitSource); - else - *NumLanes = ConvergentSize; - ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - (int)*NumLanes); - - // Set to true for lanes participating in the simd region. - bool isActive = false; - // Initialize state for active threads. - if (*LaneId < *NumLanes) { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - // install top descriptor from the thread for which the lanes are working. 
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - sourceTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = - job->slimForNextSimd; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); -} - -typedef struct ConvergentParallelJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t tnumForNextPar; -} ConvergentParallelJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent parallelism (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - int32_t NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - job->tnumForNextPar = NumThreadsClause; - - int32_t NumThreadsSource = - __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); - // reset numthreads to avoid propagating to successive #parallel - if (NumThreadsSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - 0; - - // We cannot have more than the # of convergent threads. - uint16_t NumThreads; - if (NumThreadsSource > 0) - NumThreads = min(ConvergentSize, NumThreadsSource); - else - NumThreads = ConvergentSize; - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (OmpId < NumThreads) { - // init L2 task descriptor and storage for the L1 parallel task descriptor. 
- omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - job->tnumForNextPar; -} - //////////////////////////////////////////////////////////////////////////////// // support for parallel that goes parallel (1 static level only) //////////////////////////////////////////////////////////////////////////////// @@ -227,10 +73,8 @@ INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, } // This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized) { +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); omptarget_nvptx_workFn = WorkFn; @@ -275,12 +119,9 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, // returns True if this thread is active, else False. // // Only the worker threads call this routine. -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized) { +EXTERN bool __kmpc_kernel_parallel(void **WorkFn) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); - // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; @@ -413,7 +254,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); // free - SafeFree(currTaskDescr, (char *)"new seq parallel task"); + SafeFree(currTaskDescr, "new seq parallel task"); currTaskDescr = getMyTopTaskDescriptor(threadId); currTaskDescr->RestoreLoopData(); } @@ -446,21 +287,13 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, num_threads; } -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, - int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; -} - // Do nothing. The host guarantees we started the requested number of // teams and we only need inspection of gridDim. 
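With the IsOMPRuntimeInitialized flag dropped from __kmpc_kernel_prepare_parallel and __kmpc_kernel_parallel above, a worker only needs the work-function pointer. An illustrative (not verbatim) worker-side call sequence under the simplified interface:

    void *WorkFn = nullptr;
    bool IsActive = __kmpc_kernel_parallel(&WorkFn);   // no runtime flag argument any more
    if (IsActive && WorkFn) {
      // ... execute the outlined parallel region referenced by WorkFn ...
      __kmpc_kernel_end_parallel();
    }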
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); - ASSERT0(LT_FUSSY, FALSE, + ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device"); } @@ -468,3 +301,5 @@ EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) { PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/common/src/reduction.cu similarity index 52% rename from libomptarget/deviceRTLs/nvptx/src/reduction.cu rename to libomptarget/deviceRTLs/common/src/reduction.cu index cee3e5d6d..0cfae1fc4 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/common/src/reduction.cu @@ -1,5 +1,4 @@ -//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA -//-*-===// +//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,11 +9,9 @@ // This file contains the implementation of reduction with KMPC interface. // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include -#include - -#include "omptarget-nvptx.h" +#include "common/omptarget.h" #include "target_impl.h" EXTERN @@ -57,6 +54,7 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data, } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 INLINE static uint32_t gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { uint32_t size, remote_id, physical_lane_id; @@ -75,22 +73,7 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { } while (logical_lane_id % 2 == 0 && size > 1); return (logical_lane_id == 0); } - -EXTERN -int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) { - gpu_regular_warp_reduce(reduce_data, shflFct); - return GetThreadIdInBlock() % WARPSIZE == - 0; // Result on lane 0 of the simd warp. - } else { - return gpu_irregular_simd_reduce( - reduce_data, shflFct); // Result on the first active lane. 
- } -} +#endif INLINE static int32_t nvptx_parallel_reduce_nowait( @@ -180,14 +163,6 @@ static int32_t nvptx_parallel_reduce_nowait( #endif // __CUDA_ARCH__ >= 700 } -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - isSPMDMode(), isRuntimeUninitialized()); -} - EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, @@ -198,201 +173,6 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); -} - -INLINE -static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct, - bool isSPMDExecutionMode) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - __shared__ volatile bool IsLastTeam; - - // Team masters of all teams write to the scratchpad. - if (ThreadId == 0) { - unsigned int *timestamp = GetTeamsReductionTimestamp(); - char *scratchpad = GetTeamsReductionScratchpad(); - - scratchFct(reduce_data, scratchpad, TeamId, NumTeams); - __threadfence(); - - // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. - // It resets 'timestamp' back to 0 once the last team increments - // this counter. - unsigned val = atomicInc(timestamp, NumTeams - 1); - IsLastTeam = val == NumTeams - 1; - } - - // We have to wait on L1 barrier because in GENERIC mode the workers - // are waiting on barrier 0 for work. - // - // If we guard this barrier as follows it leads to deadlock, probably - // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); - uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; - named_sync(L1_BARRIER, SyncWarps * WARPSIZE); - - // If this team is not the last, quit. - if (/* Volatile read by all threads */ !IsLastTeam) - return 0; - - // - // Last team processing. 
- // - - // Threads in excess of #teams do not participate in reduction of the - // scratchpad values. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t ActiveThreads = NumThreads; - if (NumTeams < NumThreads) { - ActiveThreads = - (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); - } - if (ThreadId >= ActiveThreads) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = ThreadId / WARPSIZE; - - // Reduce across warps to the warp master. - if ((ActiveThreads % WARPSIZE == 0) || - (WarpId < WarpsNeeded - 1)) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (ActiveThreads > 1) // Partial warp but contiguous lanes - // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/ActiveThreads % WARPSIZE, - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - if (ActiveThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#else - if (ThreadId >= NumTeams) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - // Reduce across warps to the warp master. - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#endif // __CUDA_ARCH__ >= 700 - - return ThreadId == 0; -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, isSPMDMode()); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/false); -} - -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit) { - if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) - return 0; - // The master thread of the team actually does the reduction. - while (atomicCAS((uint32_t *)crit, 0, 1)) - ; - return 1; -} - -EXTERN void -__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit) { - __threadfence_system(); - (void)atomicExch((uint32_t *)crit, 0); -} - INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { return checkGenericMode(loc) || IsTeamMaster(ThreadId); } @@ -403,8 +183,10 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) { return (s & ~(unsigned)(WARPSIZE - 1)); } -__device__ static volatile uint32_t IterCnt = 0; -__device__ static volatile uint32_t Cnt = 0; +INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } + +DEVICE static volatile uint32_t IterCnt = 0; +DEVICE static volatile uint32_t Cnt = 0; EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, @@ -426,8 +208,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - __shared__ unsigned Bound; - __shared__ unsigned ChunkTeamCount; + static SHARED unsigned Bound; + static SHARED unsigned ChunkTeamCount; // Block progress for teams greater than the current upper // limit. 
We always only allow a number of teams less or equal @@ -435,7 +217,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( bool IsMaster = isMaster(loc, ThreadId); while (IsMaster) { // Atomic read - Bound = atomicAdd((uint32_t *)&IterCnt, 0); + Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); if (TeamId < Bound + num_of_records) break; } @@ -446,12 +228,12 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( lgcpyFct(global_buffer, ModBockId, reduce_data); else lgredFct(global_buffer, ModBockId, reduce_data); - __threadfence_system(); + __kmpc_impl_threadfence_system(); // Increment team counter. // This counter is incremented by all teams in the current // BUFFER_SIZE chunk. - ChunkTeamCount = atomicInc((uint32_t *)&Cnt, num_of_records - 1); + ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); } // Synchronize if (checkSPMDMode(loc)) @@ -480,14 +262,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // by returning 1 in the thread holding the reduction result. // Check if this is the very last team. - unsigned NumRecs = min(NumTeams, num_of_records); + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); if (ChunkTeamCount == NumTeams - Bound - 1) { // // Last team processing. // if (ThreadId >= NumRecs) return 0; - NumThreads = roundToWarpsize(min(NumThreads, NumRecs)); + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); if (ThreadId >= NumThreads) return 0; @@ -502,7 +284,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // When we have more than [warpsize] number of threads // a block reduction is performed here. - uint32_t ActiveThreads = min(NumRecs, NumThreads); + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); if (ActiveThreads > WARPSIZE) { uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; // Gather all the reduced values from each warp @@ -526,9 +308,10 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( if (IsMaster && ChunkTeamCount == num_of_records - 1) { // Allow SIZE number of teams to proceed writing their // intermediate results to the global buffer. - atomicAdd((uint32_t *)&IterCnt, num_of_records); + __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); } return 0; } +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/common/src/support.cu similarity index 72% rename from libomptarget/deviceRTLs/nvptx/src/supporti.h rename to libomptarget/deviceRTLs/common/src/support.cu index d4da6ad73..f9b4ad300 100644 --- a/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/libomptarget/deviceRTLs/common/src/support.cu @@ -1,4 +1,4 @@ -//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// +//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,27 +9,30 @@ // Wrapper implementation to some functions natively supported by the GPU. 
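The buffered teams reduction rewritten above (__kmpc_nvptx_teams_reduce_nowait_v2) now uses two wrapped atomics: __kmpc_atomic_add with operand 0 as an atomic read of IterCnt, and __kmpc_atomic_inc to count the teams that have written into the current chunk. A sketch of the inc wrapper's presumed CUDA mapping (its placement in target_impl is an assumption):

    DEVICE unsigned __kmpc_atomic_inc(unsigned *Addr, unsigned Bound) {
      // atomicInc returns the old value and wraps the stored counter back to 0
      // once it reaches Bound, keeping it in the range [0, Bound].
      return atomicInc(Addr, Bound);
    }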
// //===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "common/support.h" +#include "common/debug.h" +#include "common/omptarget.h" //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// -#include "target_impl.h" - -INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; } -INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } +DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } -INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } +DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } -INLINE bool isRuntimeUninitialized() { +DEVICE bool isRuntimeUninitialized() { return (execution_param & RuntimeMask) == RuntimeUninitialized; } -INLINE bool isRuntimeInitialized() { +DEVICE bool isRuntimeInitialized() { return (execution_param & RuntimeMask) == RuntimeInitialized; } @@ -37,7 +40,7 @@ INLINE bool isRuntimeInitialized() { // Execution Modes based on location parameter fields //////////////////////////////////////////////////////////////////////////////// -INLINE bool checkSPMDMode(kmp_Ident *loc) { +DEVICE bool checkSPMDMode(kmp_Ident *loc) { if (!loc) return isSPMDMode(); @@ -55,11 +58,11 @@ INLINE bool checkSPMDMode(kmp_Ident *loc) { return isSPMDMode(); } -INLINE bool checkGenericMode(kmp_Ident *loc) { +DEVICE bool checkGenericMode(kmp_Ident *loc) { return !checkSPMDMode(loc); } -INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) { +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) { if (!loc) return isRuntimeUninitialized(); @@ -82,7 +85,7 @@ INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) { return isRuntimeUninitialized(); } -INLINE bool checkRuntimeInitialized(kmp_Ident *loc) { +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { return !checkRuntimeUninitialized(loc); } @@ -90,24 +93,6 @@ INLINE bool checkRuntimeInitialized(kmp_Ident *loc) { // support: get info from machine //////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// -// Calls to the NVPTX layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -INLINE int GetThreadIdInBlock() { return threadIdx.x; } - -INLINE int GetBlockIdInKernel() { return blockIdx.x; } - -INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } - -INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } - -INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } - -INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } - //////////////////////////////////////////////////////////////////////////////// // // Calls to the Generic Scheme Implementation Layer (assuming 1D layout) @@ -122,11 +107,11 @@ INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } // If NumThreads is 1024, master id is 992. // // Called in Generic Execution Mode only. -INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); } +DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } // The last warp is reserved for the master; other warps are workers. 
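GetMasterThreadID above rounds the last thread id in the block down to a warp boundary, so the generic-mode master is the first lane of the last warp. Worked examples, assuming WARPSIZE == 32:

    (1024 - 1) & ~(32 - 1) == 992    // matches the comment above
    ( 128 - 1) & ~(32 - 1) ==  96
    (  96 - 1) & ~(32 - 1) ==  64    // threads 0-63 are workers; the last warp holds the master

GetNumberOfWorkersInTeam() below reuses this value, so every warp before the master's warp is a worker warp.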
// Called in Generic Execution Mode only. -INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } +DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } //////////////////////////////////////////////////////////////////////////////// // get thread id in team @@ -135,7 +120,7 @@ INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } // or a serial region by the master. If the master (whose CUDA thread // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. -INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { // Implemented using control flow (predication) instead of with a modulo // operation. int tid = GetThreadIdInBlock(); @@ -151,7 +136,7 @@ INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { // //////////////////////////////////////////////////////////////////////////////// -INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { +DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { // omp_thread_num int rc; if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { @@ -167,7 +152,7 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { return rc; } -INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; int Level = parallelLevel[GetWarpId()]; @@ -185,12 +170,12 @@ INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { //////////////////////////////////////////////////////////////////////////////// // Team id linked to OpenMP -INLINE int GetOmpTeamId() { +DEVICE int GetOmpTeamId() { // omp_team_num return GetBlockIdInKernel(); // assume 1 block per team } -INLINE int GetNumberOfOmpTeams() { +DEVICE int GetNumberOfOmpTeams() { // omp_num_teams return GetNumberOfBlocksInKernel(); // assume 1 block per team } @@ -198,31 +183,31 @@ INLINE int GetNumberOfOmpTeams() { //////////////////////////////////////////////////////////////////////////////// // Masters -INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } +DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } //////////////////////////////////////////////////////////////////////////////// // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] += (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __threadfence(); + __kmpc_impl_threadfence(); } __kmpc_impl_syncwarp(Mask); } -INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] -= (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __threadfence(); + __kmpc_impl_threadfence(); } __kmpc_impl_syncwarp(Mask); } @@ -231,13 +216,13 @@ INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { // get OpenMP number of procs // Get the number of processors in the device. 
-INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { +DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { if (!isSPMDExecutionMode) return GetNumberOfWorkersInTeam(); return GetNumberOfThreadsInBlock(); } -INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { return GetNumberOfProcsInDevice(isSPMDExecutionMode); } @@ -245,7 +230,7 @@ INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { // Memory //////////////////////////////////////////////////////////////////////////////// -INLINE unsigned long PadBytes(unsigned long size, +DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment) // must be a power of 2 { // compute the necessary padding to satisfy alignment constraint @@ -254,43 +239,30 @@ INLINE unsigned long PadBytes(unsigned long size, return (~(unsigned long)size + 1) & (alignment - 1); } -INLINE void *SafeMalloc(size_t size, const char *msg) // check if success +DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success { - void *ptr = malloc(size); + void *ptr = __kmpc_impl_malloc(size); PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", (unsigned long long)size, msg, (unsigned long long)ptr); return ptr; } -INLINE void *SafeFree(void *ptr, const char *msg) { +DEVICE void *SafeFree(void *ptr, const char *msg) { PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); - free(ptr); + __kmpc_impl_free(ptr); return NULL; } -//////////////////////////////////////////////////////////////////////////////// -// Named Barrier Routines -//////////////////////////////////////////////////////////////////////////////// - -INLINE void named_sync(const int barrier, const int num_threads) { - asm volatile("bar.sync %0, %1;" - : - : "r"(barrier), "r"(num_threads) - : "memory"); -} - //////////////////////////////////////////////////////////////////////////////// // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -INLINE unsigned int *GetTeamsReductionTimestamp() { +DEVICE unsigned int *GetTeamsReductionTimestamp() { return static_cast(ReductionScratchpadPtr); } -INLINE char *GetTeamsReductionScratchpad() { +DEVICE char *GetTeamsReductionScratchpad() { return static_cast(ReductionScratchpadPtr) + 256; } -INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) { - ReductionScratchpadPtr = ScratchpadPtr; -} +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/common/src/sync.cu similarity index 82% rename from libomptarget/deviceRTLs/nvptx/src/sync.cu rename to libomptarget/deviceRTLs/common/src/sync.cu index 28a541901..0a00f2fa0 100644 --- a/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/libomptarget/deviceRTLs/common/src/sync.cu @@ -1,4 +1,4 @@ -//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===// +//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,8 +9,9 @@ // Include all synchronization. 
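The PadBytes helper above computes the gap to the next aligned boundary via two's-complement negation, and SafeMalloc/SafeFree now route through the __kmpc_impl_malloc/__kmpc_impl_free shims. A short worked example of the padding arithmetic (alignment must be a power of two):

    // PadBytes(size, alignment) == (~size + 1) & (alignment - 1)
    PadBytes(12, 8) == ((-12) & 7) == 4    // 12 + 4 = 16 is 8-byte aligned
    PadBytes(16, 8) == ((-16) & 7) == 0    // already aligned, no padding needed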
// //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" #include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// @@ -60,8 +61,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { PRINT(LD_SYNC, "call kmpc_barrier with %d omp threads, sync parameter %d\n", (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); + __kmpc_impl_named_sync(threads); } } else { // Still need to flush the memory per the standard. @@ -79,23 +79,6 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); } -// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { - int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " - "%d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); -} - //////////////////////////////////////////////////////////////////////////////// // KMP MASTER //////////////////////////////////////////////////////////////////////////////// @@ -124,7 +107,7 @@ EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_end_single\n"); // decide to implement single with master: master get the single ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); - // sync barrier is explicitely called... so that is not a problem + // sync barrier is explicitly called... 
so that is not a problem } //////////////////////////////////////////////////////////////////////////////// @@ -133,7 +116,7 @@ EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { EXTERN void __kmpc_flush(kmp_Ident *loc) { PRINT0(LD_IO, "call kmpc_flush\n"); - __threadfence(); + __kmpc_impl_threadfence(); } //////////////////////////////////////////////////////////////////////////////// @@ -153,3 +136,5 @@ EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { PRINT0(LD_IO, "call __kmpc_syncwarp\n"); __kmpc_impl_syncwarp(Mask); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/common/src/task.cu similarity index 97% rename from libomptarget/deviceRTLs/nvptx/src/task.cu rename to libomptarget/deviceRTLs/common/src/task.cu index d618ff16d..f28cd2b81 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/common/src/task.cu @@ -26,8 +26,9 @@ // - end // //===----------------------------------------------------------------------===// +#pragma omp declare target -#include "omptarget-nvptx.h" +#include "common/omptarget.h" EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( kmp_Ident *loc, // unused @@ -84,7 +85,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, P64(newKmpTaskDescr)); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - // 1. get explict task descr from kmp task descr + // 1. get explicit task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); @@ -123,7 +124,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - // 1. get explict task descr from kmp task descr + // 1. get explicit task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); @@ -149,7 +150,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - // 1. get explict task descr from kmp task descr + // 1. 
get explicit task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); @@ -214,3 +215,5 @@ EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/libomptarget/deviceRTLs/common/state-queue.h similarity index 97% rename from libomptarget/deviceRTLs/nvptx/src/state-queue.h rename to libomptarget/deviceRTLs/common/state-queue.h index 9d7576bcd..8320929cf 100644 --- a/libomptarget/deviceRTLs/nvptx/src/state-queue.h +++ b/libomptarget/deviceRTLs/common/state-queue.h @@ -21,7 +21,7 @@ #include -#include "option.h" // choices we have +#include "target_impl.h" template class omptarget_nvptx_Queue { private: diff --git a/libomptarget/deviceRTLs/nvptx/src/state-queuei.h b/libomptarget/deviceRTLs/common/state-queuei.h similarity index 83% rename from libomptarget/deviceRTLs/nvptx/src/state-queuei.h rename to libomptarget/deviceRTLs/common/state-queuei.h index 3c3be113e..902eff903 100644 --- a/libomptarget/deviceRTLs/nvptx/src/state-queuei.h +++ b/libomptarget/deviceRTLs/common/state-queuei.h @@ -1,4 +1,4 @@ -//===------- state-queue.cu - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// +//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -20,12 +20,12 @@ template INLINE uint32_t omptarget_nvptx_Queue::ENQUEUE_TICKET() { - return atomicAdd((unsigned int *)&tail, 1); + return __kmpc_atomic_add((unsigned int *)&tail, 1u); } template INLINE uint32_t omptarget_nvptx_Queue::DEQUEUE_TICKET() { - return atomicAdd((unsigned int *)&head, 1); + return __kmpc_atomic_add((unsigned int *)&head, 1u); } template @@ -37,28 +37,28 @@ omptarget_nvptx_Queue::ID(uint32_t ticket) { template INLINE bool omptarget_nvptx_Queue::IsServing(uint32_t slot, uint32_t id) { - return atomicAdd((unsigned int *)&ids[slot], 0) == id; + return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id; } template INLINE void omptarget_nvptx_Queue::PushElement(uint32_t slot, ElementType *element) { - atomicExch((unsigned long long *)&elementQueue[slot], - (unsigned long long)element); + __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot], + (unsigned long long)element); } template INLINE ElementType * omptarget_nvptx_Queue::PopElement(uint32_t slot) { - return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot], - (unsigned long long)0); + return (ElementType *)__kmpc_atomic_add( + (unsigned long long *)&elementQueue[slot], (unsigned long long)0); } template INLINE void omptarget_nvptx_Queue::DoneServing(uint32_t slot, uint32_t id) { - atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID); + __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID); } template diff --git a/libomptarget/deviceRTLs/nvptx/src/support.h b/libomptarget/deviceRTLs/common/support.h similarity index 58% rename from libomptarget/deviceRTLs/nvptx/src/support.h rename to libomptarget/deviceRTLs/common/support.h index e10f2a19d..a46432825 100644 --- a/libomptarget/deviceRTLs/nvptx/src/support.h +++ b/libomptarget/deviceRTLs/common/support.h @@ -1,4 +1,4 @@ -//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===// 
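The omptarget_nvptx_Queue helpers above form a small ticket protocol: ENQUEUE_TICKET and DEQUEUE_TICKET hand out monotonically increasing tickets via an atomic add on tail/head, IsServing does an atomic read of the slot's current id to decide whose turn it is, PushElement/PopElement transfer the payload with an atomic exchange/read, and DoneServing advances the slot id to (id + 1) % MAX_ID so the next ticket can proceed. The patch only swaps the raw atomicAdd/atomicExch calls for the __kmpc_atomic_add/__kmpc_atomic_exchange wrappers; the protocol itself is unchanged.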
+//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,13 +10,18 @@ // //===----------------------------------------------------------------------===// +#ifndef OMPTARGET_SUPPORT_H +#define OMPTARGET_SUPPORT_H + +#include "interface.h" #include "target_impl.h" + //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// enum ExecutionMode { - Generic = 0x00u, - Spmd = 0x01u, + Spmd = 0x00u, + Generic = 0x01u, ModeMask = 0x01u, }; @@ -26,71 +31,68 @@ enum RuntimeMode { RuntimeMask = 0x02u, }; -INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -INLINE bool isGenericMode(); -INLINE bool isSPMDMode(); -INLINE bool isRuntimeUninitialized(); -INLINE bool isRuntimeInitialized(); +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +DEVICE bool isGenericMode(); +DEVICE bool isSPMDMode(); +DEVICE bool isRuntimeUninitialized(); +DEVICE bool isRuntimeInitialized(); //////////////////////////////////////////////////////////////////////////////// -// get info from machine +// Execution Modes based on location parameter fields //////////////////////////////////////////////////////////////////////////////// -// get low level ids of resources -INLINE int GetThreadIdInBlock(); -INLINE int GetBlockIdInKernel(); -INLINE int GetNumberOfBlocksInKernel(); -INLINE int GetNumberOfThreadsInBlock(); -INLINE unsigned GetWarpId(); -INLINE unsigned GetLaneId(); +DEVICE bool checkSPMDMode(kmp_Ident *loc); +DEVICE bool checkGenericMode(kmp_Ident *loc); +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// // get global ids to locate tread/team info (constant regardless of OMP) -INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -INLINE int GetMasterThreadID(); -INLINE int GetNumberOfWorkersInTeam(); +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +DEVICE int GetMasterThreadID(); +DEVICE int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -INLINE int GetOmpThreadId(int threadId, +DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode); // omp_thread_num -INLINE int GetOmpTeamId(); // omp_team_num +DEVICE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -INLINE int GetNumberOfOmpTeams(); // omp_num_teams +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +DEVICE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters -INLINE int IsTeamMaster(int ompThreadId); +DEVICE int IsTeamMaster(int ompThreadId); // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -INLINE void DecParallelLevel(bool ActiveParallel, 
__kmpc_impl_lanemask_t Mask); +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// // safe alloc and free -INLINE void *SafeMalloc(size_t size, const char *msg); // check if success -INLINE void *SafeFree(void *ptr, const char *msg); +DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success +DEVICE void *SafeFree(void *ptr, const char *msg); // pad to a alignment (power of 2 only) -INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); +DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); #define ADD_BYTES(_addr, _bytes) \ ((void *)((char *)((void *)(_addr)) + (_bytes))) #define SUB_BYTES(_addr, _bytes) \ ((void *)((char *)((void *)(_addr)) - (_bytes))) -//////////////////////////////////////////////////////////////////////////////// -// Named Barrier Routines -//////////////////////////////////////////////////////////////////////////////// -INLINE void named_sync(const int barrier, const int num_threads); - //////////////////////////////////////////////////////////////////////////////// // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -INLINE unsigned int *GetTeamsReductionTimestamp(); -INLINE char *GetTeamsReductionScratchpad(); -INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); +DEVICE unsigned int *GetTeamsReductionTimestamp(); +DEVICE char *GetTeamsReductionScratchpad(); + +#endif diff --git a/libomptarget/deviceRTLs/interface.h b/libomptarget/deviceRTLs/interface.h index 0f0f43ebb..5f539bc3f 100644 --- a/libomptarget/deviceRTLs/interface.h +++ b/libomptarget/deviceRTLs/interface.h @@ -16,8 +16,12 @@ #ifndef _INTERFACES_H_ #define _INTERFACES_H_ +#include #include +#ifdef __AMDGCN__ +#include "amdgcn/src/amdgcn_interface.h" +#endif #ifdef __CUDACC__ #include "nvptx/src/nvptx_interface.h" #endif @@ -26,7 +30,6 @@ // OpenMP interface //////////////////////////////////////////////////////////////////////////////// -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ typedef enum omp_sched_t { @@ -95,7 +98,7 @@ EXTERN int omp_get_max_task_priority(void); //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -// kmp specifc types +// kmp specific types //////////////////////////////////////////////////////////////////////////////// typedef enum kmp_sched_t { @@ -190,17 +193,10 @@ typedef struct ident { // parallel defs typedef ident_t kmp_Ident; -typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); -typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t lane_offset, int16_t shortCircuit); -typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width); -typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width, - int32_t reduce); typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void 
*reduce_data); // task defs @@ -224,12 +220,6 @@ typedef int32_t kmp_CriticalName[8]; EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, int32_t num_threads); -// simd -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, - int32_t simd_limit); -// aee ... not supported -// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr -// microtask, ...); EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); @@ -351,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); -// Support for reducing conditional lastprivate variables -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, - int32_t global_tid, - int32_t varNum, void *array); - // reduction EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void 
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); // sync barrier EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); // single @@ -465,29 +419,14 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal); // non standard -EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing); -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, + int16_t RequiresOMPRuntime); EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource); -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes); -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); - EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); @@ -499,33 +438,6 @@ EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); -// The slot used for data sharing by the master and worker threads. We use a -// complete (default size version and an incomplete one so that we allow sizes -// greater than the default). -struct __kmpc_data_sharing_slot { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[]; -}; -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, - size_t InitialDataSize); -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint); - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized); - // SPMD execution mode interrogation function. 
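The variable-sharing entry points retained in interface.h above (__kmpc_begin_sharing_variables, __kmpc_get_shared_variables, __kmpc_end_sharing_variables) hand a list of pointers from the team master to the workers that execute the outlined parallel function. A minimal sketch of the expected pairing follows; it is illustrative only, not part of the patch, the two shared values are hypothetical, and in generated code the begin/end calls sit on the master side while the get call runs in the workers.

// Illustrative sketch only -- not part of this patch.
static __device__ void SharingPairSketch(int *A, double *B) {
  void **GlobalArgs;
  __kmpc_begin_sharing_variables(&GlobalArgs, 2); // master: reserve two slots
  GlobalArgs[0] = A;                              // master: publish addresses
  GlobalArgs[1] = B;

  void **Args;
  __kmpc_get_shared_variables(&Args); // workers: read back the same list
  // Args[0] == A and Args[1] == B inside the outlined parallel function.

  __kmpc_end_sharing_variables(); // master: tear the list down afterwards
}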
EXTERN int8_t __kmpc_is_spmd_exec_mode(); diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 1cd13c503..78d3753be 100644 --- a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,60 +10,23 @@ # ##===----------------------------------------------------------------------===## -set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING - "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") - -if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) - find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) - if(NOT ALTERNATE_CUDA_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") - endif() - set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) -endif() - -# We can't use clang as nvcc host preprocessor, so we attempt to replace it with -# gcc. -if(CUDA_HOST_COMPILER MATCHES clang) - - find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) - - if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") - libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") - return() - endif() - set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) -endif() - get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +set(devicertl_common_directory + ${devicertl_base_directory}/common) +set(devicertl_nvptx_directory + ${devicertl_base_directory}/nvptx) if(LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Building CUDA offloading device RTL.") - - # We really don't have any host code, so we don't need to care about - # propagating host flags. - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(cuda_src_files - src/cancel.cu - src/critical.cu - src/data_sharing.cu - src/libcall.cu - src/loop.cu - src/omptarget-nvptx.cu - src/parallel.cu - src/reduction.cu - src/sync.cu - src/task.cu - ) - - set(omp_data_objects src/omp_data.cu) - - # Get the compute capability the user requested or use SM_35 by default. - # SM_35 is what clang uses by default. - set(default_capabilities 35) + # Build library support for the highest compute capability the system supports + # and always build support for sm_35 by default + if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35) + set(default_capabilities 35) + else() + set(default_capabilities "35,${LIBOMPTARGET_DEP_CUDA_ARCH}") + endif() + if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") @@ -76,26 +39,14 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) endforeach() + # Override default MAX_SM in src/target_impl.h if requested + if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") + endif() + # Activate RTL message dumps if requested by the user. 
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL "Activate NVPTX device RTL debug messages.") - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) - endif() - - # NVPTX runtime library has to be statically linked. Dynamic linking is not - # yet supported by the CUDA toolchain on the device. - set(BUILD_SHARED_LIBS OFF) - set(CUDA_SEPARABLE_COMPILATION ON) - list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}) - cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} - OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) - - # Install device RTL under the lib destination folder. - install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") - - target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) - # Check if we can create an LLVM bitcode implementation of the runtime library # that could be inlined in the user application. For that we need to find @@ -108,36 +59,38 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) include(LibomptargetNVPTXBitcodeLibrary) - set(bclib_default FALSE) - if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - set(bclib_default TRUE) - endif() - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL - "Enable CUDA LLVM bitcode offloading device RTL.") - if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) - if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") - endif() + if (LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED) libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + set(cuda_src_files + ${devicertl_common_directory}/src/cancel.cu + ${devicertl_common_directory}/src/critical.cu + ${devicertl_common_directory}/src/data_sharing.cu + ${devicertl_common_directory}/src/libcall.cu + ${devicertl_common_directory}/src/loop.cu + ${devicertl_common_directory}/src/omp_data.cu + ${devicertl_common_directory}/src/omptarget.cu + ${devicertl_common_directory}/src/parallel.cu + ${devicertl_common_directory}/src/reduction.cu + ${devicertl_common_directory}/src/support.cu + ${devicertl_common_directory}/src/sync.cu + ${devicertl_common_directory}/src/task.cu + src/target_impl.cu + ) + # Set flags for LLVM Bitcode compilation. set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} - -I${devicertl_base_directory}) + -I${devicertl_base_directory} + -I${devicertl_nvptx_directory}/src) + if(${LIBOMPTARGET_NVPTX_DEBUG}) set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) else() set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) endif() - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR GREATER 8) - set(bc_flags ${bc_flags} -Dnv_weak=weak) - endif() - # Create target to build all Bitcode libraries. - add_custom_target(omptarget-nvptx-bc) + add_custom_target(bolt-omptarget-nvptx-bc) # Generate a Bitcode library for all the compute capabilities the user requested. 
foreach(sm ${nvptx_sm_list}) @@ -150,7 +103,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) get_filename_component(outfile ${src} NAME) add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} -c ${infile} -o ${outfile}-sm_${sm}.bc DEPENDS ${infile} IMPLICIT_DEPENDS CXX ${infile} @@ -171,13 +124,13 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) ) set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) + add_custom_target(bolt-omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + add_dependencies(bolt-omptarget-nvptx-bc bolt-omptarget-nvptx-${sm}-bc) # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD + add_custom_command(TARGET bolt-omptarget-nvptx-${sm}-bc POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - $) + ${LIBOMPTARGET_LIBRARY_DIR}) # Install bitcode library under the lib destination folder. install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") @@ -186,5 +139,5 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) add_subdirectory(test) else() - libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") + libomptarget_say("Not building CUDA offloading device RTL: tools to build bc lib not found in the system.") endif() diff --git a/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt index 989a01fe0..4149dfacb 100644 --- a/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt +++ b/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -63,7 +63,7 @@ The (simplified) pseudo code generated by LLVM is as follows: b. its lane_id c. the offset of the lane_id which hosts a remote ReduceData relative to the current one - d. an algorithm version paramter determining which reduction + d. an algorithm version parameter determining which reduction algorithm to use. This shuffleReduceFn retrieves the remote ReduceData through shuffle intrinsics and reduces, using the algorithm specified by the 4th diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu deleted file mode 100644 index 5e936b016..000000000 --- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ /dev/null @@ -1,581 +0,0 @@ -//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of data sharing environments/ -// -//===----------------------------------------------------------------------===// -#include "omptarget-nvptx.h" -#include "target_impl.h" -#include - -// Warp ID in the CUDA block -INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } -// Lane ID in the CUDA warp. 
-INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } - -// Return true if this is the first active thread in the warp. -INLINE static bool IsWarpMasterActiveThread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} -// Return true if this is the master thread. -INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { - return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); -} - -/// Return the provided size aligned to the size of a pointer. -INLINE static size_t AlignVal(size_t Val) { - const size_t Align = (size_t)sizeof(void *); - if (Val & (Align - 1)) { - Val += Align; - Val &= ~(Align - 1); - } - return Val; -} - -#define DSFLAG 0 -#define DSFLAG_INIT 0 -#define DSPRINT(_flag, _str, _args...) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ - } \ - } -#define DSPRINT0(_flag, _str) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ - } \ - } - -// Initialize the shared data structures. This is expected to be called for the -// master thread and warp masters. \param RootS: A pointer to the root of the -// data sharing stack. \param InitialDataSize: The initial size of the data in -// the slot. -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, - size_t InitialDataSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - DSPRINT0(DSFLAG_INIT, - "Entering __kmpc_initialize_data_sharing_environment\n"); - - unsigned WID = getWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); - - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = - teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); - - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - - // We don't need to initialize the frame and active threads. - - DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); - DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (unsigned long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", - (unsigned long long)RootS->Next); - DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (unsigned long long)DataSharingState.SlotPtr[WID]); - DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (unsigned long long)DataSharingState.StackPtr[WID]); - - DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); -} - -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); - - // If the runtime has been elided, used __shared__ memory for master-worker - // data sharing. 
- if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - - unsigned WID = getWarpId(); - __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); - - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); - // Save the current values. - *SavedSharedSlot = SlotP; - *SavedSharedStack = StackP; - *SavedSharedFrame = FrameP; - *SavedActiveThreads = ActiveT; - - DSPRINT(DSFLAG, "Warp ID: %u\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); - DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - - // Only the warp active master needs to grow the stack. - if (IsWarpMasterActiveThread()) { - // Save the current active threads. - ActiveT = CurActiveThreads; - - // Make sure we use aligned sizes to avoid rematerialization of data. - SharingDataSize = AlignVal(SharingDataSize); - // FIXME: The default data size can be assumed to be aligned? - SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); - - // Check if we have room for the data in the current slot. - const uintptr_t CurrentStartAddress = (uintptr_t)StackP; - const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequiredEndAddress = - CurrentStartAddress + (uintptr_t)SharingDataSize; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", - (unsigned long long)CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", - (unsigned long long)CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", - (unsigned long long)RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); - - // If we require a new slot, allocate it and initialize it (or attempt to - // reuse one). Also, set the shared stack and slot pointers to the new - // place. If we do not need to grow the stack, just adapt the stack and - // frame pointers. - if (CurrentEndAddress < RequiredEndAddress) { - size_t NewSize = (SharingDataSize > SharingDefaultDataSize) - ? SharingDataSize - : SharingDefaultDataSize; - __kmpc_data_sharing_slot *NewSlot = 0; - - // Attempt to reuse an existing slot. 
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - if (ExistingSlotSize >= NewSize) { - DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (unsigned long long)ExistingSlot); - NewSlot = ExistingSlot; - } else { - DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (unsigned long long)SlotP->Next); - free(ExistingSlot); - } - } - - if (!NewSlot) { - NewSlot = (__kmpc_data_sharing_slot *)malloc( - sizeof(__kmpc_data_sharing_slot) + NewSize); - DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (unsigned long long)NewSlot, NewSize); - } - - NewSlot->Next = 0; - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - SlotP->Next = NewSlot; - SlotP = NewSlot; - StackP = &NewSlot->Data[SharingDataSize]; - FrameP = &NewSlot->Data[0]; - } else { - - // Clean up any old slot that we may still have. The slot producers, do - // not eliminate them because that may be used to return data. - if (SlotP->Next) { - DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (unsigned long long)SlotP->Next); - free(SlotP->Next); - SlotP->Next = 0; - } - - FrameP = StackP; - StackP = (void *)RequiredEndAddress; - } - } - - // FIXME: Need to see the impact of doing it here. - __threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); - - // All the threads in this warp get the frame they should work with. - return FrameP; -} - -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - - unsigned WID = getWarpId(); - - if (IsEntryPoint) { - if (IsWarpMasterActiveThread()) { - DSPRINT0(DSFLAG, "Doing clean up\n"); - - // The master thread cleans the saved slot, because this is an environment - // only for the master. - __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) - ? *SavedSharedSlot - : DataSharingState.SlotPtr[WID]; - - if (S->Next) { - free(S->Next); - S->Next = 0; - } - } - - DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n"); - return; - } - - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - // Only the warp master can restore the stack and frame information, and only - // if there are no other threads left behind in this environment (i.e. the - // warp diverged and returns in different places). This only works if we - // assume that threads will converge right after the call site that started - // the environment. - if (IsWarpMasterActiveThread()) { - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Before restoring the stack\n"); - // Zero the bits in the mask. If it is still different from zero, then we - // have other threads that will return after the current ones. - ActiveT &= ~CurActive; - - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", - (unsigned)CurActive, (unsigned)ActiveT); - - if (!ActiveT) { - // No other active threads? Great, lets restore the stack. 
- - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - - SlotP = *SavedSharedSlot; - StackP = *SavedSharedStack; - FrameP = *SavedSharedFrame; - ActiveT = *SavedActiveThreads; - - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", - (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", - (unsigned long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", - (unsigned long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - } - } - - // FIXME: Need to see the impact of doing it here. - __threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); - return; -} - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized) { - DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); - - // If the runtime has been elided, use __shared__ memory for master-worker - // data sharing. We're reusing the statically allocated data structure - // that is used for standard data sharing. - if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - // Get the frame used by the requested thread. - - unsigned SourceWID = SourceThreadID / WARPSIZE; - - DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); - - void * volatile P = DataSharingState.FramePtr[SourceWID]; - DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); - return P; -} - -//////////////////////////////////////////////////////////////////////////////// -// Runtime functions for trunk data sharing scheme. -//////////////////////////////////////////////////////////////////////////////// - -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < WARPSIZE; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. 
- if (threadIdx.x == 0) - data_sharing_init_stack_common(); - - __threadfence_block(); -} - -INLINE static void* data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = getWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. - int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. 
If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = getWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } -} - -// Begin a data sharing context. Maintain a list of references to shared -// variables. This list of references to shared variables will be passed -// to one or more threads. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { - omptarget_nvptx_globalArgs.EnsureSize(nArgs); - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// End a data sharing context. There is no need to have a list of refs -// to shared variables because the context in which those variables were -// shared has now ended. This should clean-up the list of references only -// without affecting the actual global storage of the variables. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_end_sharing_variables() { - omptarget_nvptx_globalArgs.DeInit(); -} - -// This function will return a list of references to global variables. This -// is how the workers will get a reference to the globalized variable. The -// members of this list will be passed to the outlined parallel function -// preserving the order. -// Called by all workers. -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// This function is used to init static memory manager. 
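The push/pop pair documented above is the compiler-facing API for globalizing a variable whose address escapes into a parallel region; this patch only moves the implementation from the nvptx directory into common/. A minimal sketch of the pairing, illustrative only and not part of the patch (the int payload and the UseSharedMemory value are hypothetical):

// Illustrative sketch only -- not part of this patch.
static __device__ void GlobalizedVariableSketch() {
  // Each lane receives DataSize bytes; the warp master allocates for the warp.
  int *X = (int *)__kmpc_data_sharing_push_stack(sizeof(int),
                                                 /*UseSharedMemory=*/0);
  *X = 42; // the address may now escape into a nested parallel region
  __kmpc_data_sharing_pop_stack(X); // pass back the frame returned by the push
}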
This manager is used to -// manage statically allocated global memory. This memory is allocated by the -// compiler and used to correctly implement globalization of the variables in -// target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, - const void **frame) { - if (is_shared) { - *frame = buf; - return; - } - if (isSPMDExecutionMode) { - if (GetThreadIdInBlock() == 0) { - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - } - __kmpc_impl_syncthreads(); - return; - } - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - __threadfence(); -} - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared) { - if (is_shared) - return; - if (isSPMDExecutionMode) { - __kmpc_impl_syncthreads(); - if (GetThreadIdInBlock() == 0) { - omptarget_nvptx_simpleMemoryManager.Release(); - } - return; - } - __threadfence(); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - omptarget_nvptx_simpleMemoryManager.Release(); -} - diff --git a/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h index 7c9e471e4..c5e91c5bf 100644 --- a/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -13,5 +13,6 @@ #define EXTERN extern "C" __device__ typedef uint32_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/option.h b/libomptarget/deviceRTLs/nvptx/src/option.h deleted file mode 100644 index 3c0beaf62..000000000 --- a/libomptarget/deviceRTLs/nvptx/src/option.h +++ /dev/null @@ -1,68 +0,0 @@ -//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// GPU default options -// -//===----------------------------------------------------------------------===// -#ifndef _OPTION_H_ -#define _OPTION_H_ - -#include "interface.h" - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. 
-#if __CUDA_ARCH__ >= 700 -#define OMP_STATE_COUNT 32 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#define MAX_SM 56 -#else -#define OMP_STATE_COUNT 16 -#define MAX_SM 16 -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -//////////////////////////////////////////////////////////////////////////////// -// algo options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// misc options (by def everythig here is device) -//////////////////////////////////////////////////////////////////////////////// - -#define INLINE __forceinline__ __device__ -#define NOINLINE __noinline__ __device__ -#ifndef TRUE -#define TRUE 1 -#endif -#ifndef FALSE -#define FALSE 0 -#endif - -#endif diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu new file mode 100644 index 000000000..75945e3cd --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -0,0 +1,181 @@ +//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#pragma omp declare target + +#include "target_impl.h" +#include "common/debug.h" + +#include + +// Forward declaration of CUDA primitives which will be eventually transformed +// into LLVM intrinsics. +extern "C" { +unsigned int __activemask(); +unsigned int __ballot(unsigned); +// The default argument here is based on NVIDIA's website +// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ +int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); +int __shfl(int val, int src_line, int width = WARPSIZE); +int __shfl_down(int var, unsigned delta, int width); +int __shfl_down_sync(unsigned mask, int var, unsigned delta, int width); +void __syncwarp(int mask); +void __threadfence(); +void __threadfence_block(); +void __threadfence_system(); +} + +DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); +} + +DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + uint64_t val; + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); + return res; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); + return res; +} + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +DEVICE double __kmpc_impl_get_wtick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +DEVICE double __kmpc_impl_get_wtime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * __kmpc_impl_get_wtick(); +} + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
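target_impl.cu wraps the raw CUDA warp primitives behind __kmpc_impl_* entry points that always take an explicit lane mask, hiding the CUDA 8/9 interface differences from the rest of the runtime. As an illustration of how such wrappers are typically used, here is a hedged sketch (not part of the patch) of a warp-wide sum, assuming all WARPSIZE lanes are active:

// Illustrative sketch only -- not part of this patch.
DEVICE static int32_t WarpSumSketch(int32_t Val) {
  __kmpc_impl_lanemask_t Mask = __kmpc_impl_activemask();
  // Halving shuffle-down reduction; lane 0 ends up with the full-warp sum.
  for (int32_t Offset = WARPSIZE / 2; Offset > 0; Offset /= 2)
    Val += __kmpc_impl_shfl_down_sync(Mask, Val, Offset, WARPSIZE);
  return Val;
}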
+DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif // CUDA_VERSION +} + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif // CUDA_VERSION +} + +DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } + +DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // In Cuda < 9.0 no need to sync threads in warps. +#endif // CUDA_VERSION +} + +// NVPTX specific kernel initialization +DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ +} + +// Barrier until num_threads arrive. +DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { + // The named barrier for active parallel threads of a team in an L1 parallel + // region to synchronize with each other. + int barrier = 1; + asm volatile("bar.sync %0, %1;" + : + : "r"(barrier), "r"(num_threads) + : "memory"); +} + +DEVICE void __kmpc_impl_threadfence() { __threadfence(); } +DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); } +DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); } + +// Calls to the NVPTX layer (assuming 1D layout) +DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } +DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } +DEVICE int GetNumberOfBlocksInKernel() { + return __nvvm_read_ptx_sreg_nctaid_x(); +} +DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } + +#define __OMP_SPIN 1000 +#define UNSET 0u +#define SET 1u + +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) { + // TODO: not sure spinning is a good idea here.. + while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { + int32_t start = __nvvm_read_ptx_sreg_clock(); + int32_t now; + for (;;) { + now = __nvvm_read_ptx_sreg_clock(); + int32_t cycles = now > start ? 
now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { + break; + } + } + } // wait for 0 to be the read value +} + +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) { + (void)__kmpc_atomic_exchange(lock, UNSET); +} + +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { + return __kmpc_atomic_add(lock, 0u); +} + +DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } +DEVICE void __kmpc_impl_free(void *x) { free(x); } + +#pragma omp end declare target diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h index de2776e91..1d7b649fe 100644 --- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -12,89 +12,163 @@ #ifndef _TARGET_IMPL_H_ #define _TARGET_IMPL_H_ -#include - -#include "option.h" - -INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); -} - -INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - uint64_t val; - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; -} - -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = - UINT32_C(0xffffffff); - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); - return res; -} - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); - return res; -} +#include +#include +#include +#include +#include + +#include "nvptx_interface.h" + +#define DEVICE __device__ +#define INLINE __forceinline__ DEVICE +#define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ +#define ALIGN(N) __align__(N) + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#if __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#else +#define OMP_STATE_COUNT 16 +#endif -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } +#if !defined(MAX_SM) +#if __CUDA_ARCH__ >= 900 +#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option +#elif __CUDA_ARCH__ >= 800 +// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs +// GA102 design has a maximum of 84 SMs +#define MAX_SM 108 +#elif __CUDA_ARCH__ >= 700 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define MAX_SM 56 +#else +#define MAX_SM 16 +#endif +#endif -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot.
+ DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, +}; + +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; + +DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); +DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); +DEVICE uint32_t __kmpc_impl_smid(); +DEVICE double __kmpc_impl_get_wtick(); +DEVICE double __kmpc_impl_get_wtime(); + +INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } +INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. #endif -// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); -#else - return __ballot(1); -#endif -} +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane); -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width); + +DEVICE void __kmpc_impl_syncthreads(); +DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); + +// NVPTX specific kernel initialization +DEVICE void __kmpc_impl_target_init(); + +// Barrier until num_threads arrive. +DEVICE void __kmpc_impl_named_sync(uint32_t num_threads); + +DEVICE void __kmpc_impl_threadfence(); +DEVICE void __kmpc_impl_threadfence_block(); +DEVICE void __kmpc_impl_threadfence_system(); + +// Calls to the NVPTX layer (assuming 1D layout) +DEVICE int GetThreadIdInBlock(); +DEVICE int GetBlockIdInKernel(); +DEVICE int GetNumberOfBlocksInKernel(); +DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); + +// Forward declaration of atomics. Although they're template functions, we +// already have definitions for different types in CUDA internal headers with +// the right mangled names. 
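target_impl.h then exposes the same functionality as declarations, together with the __kmpc_atomic_* template wrappers over the CUDA atomics declared just below and the small lock API whose target_impl.cu implementation spins on __kmpc_atomic_cas and releases with __kmpc_atomic_exchange. A usage sketch, illustrative only and not part of the patch; the lock object and counter are hypothetical and assumed to live in device-visible memory:

// Illustrative sketch only -- not part of this patch.
DEVICE void GuardedIncrementSketch(omp_lock_t *Lock, uint32_t *Counter) {
  __kmpc_impl_set_lock(Lock);   // spins on CAS until UNSET -> SET succeeds
  ++*Counter;                   // update guarded by the lock
  __kmpc_impl_unset_lock(Lock); // atomic exchange back to UNSET
}

DEVICE void LockFreeIncrementSketch(uint32_t *Counter) {
  __kmpc_atomic_add(Counter, 1u); // forwards to the matching CUDA atomicAdd
}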
+template <typename T> DEVICE T atomicAdd(T *address, T val); +template <typename T> DEVICE T atomicInc(T *address, T val); +template <typename T> DEVICE T atomicMax(T *address, T val); +template <typename T> DEVICE T atomicExch(T *address, T val); +template <typename T> DEVICE T atomicCAS(T *address, T compare, T val); + +// Atomics +template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) { + return atomicAdd(address, val); +} -INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { -#if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); -#else - return __shfl(Var, SrcLane); -#endif // CUDA_VERSION +template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) { + return atomicInc(address, val); } -INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width) { -#if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); -#else - return __shfl_down(Var, Delta, Width); -#endif // CUDA_VERSION +template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) { + return atomicMax(address, val); } -INLINE void __kmpc_impl_syncthreads() { - // Use original __syncthreads if compiled by nvcc or clang >= 9.0. -#if !defined(__clang__) || __clang_major__ >= 9 - __syncthreads(); -#else - asm volatile("bar.sync %0;" : : "r"(0) : "memory"); -#endif // __clang__ +template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) { + return atomicExch(address, val); } -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { -#if CUDA_VERSION >= 9000 - __syncwarp(Mask); -#else - // In Cuda < 9.0 no need to sync threads in warps. -#endif // CUDA_VERSION +template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { + return atomicCAS(address, compare, val); } +// Locks +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +DEVICE void *__kmpc_impl_malloc(size_t); +DEVICE void __kmpc_impl_free(void *); + #endif diff --git a/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt index 33945d167..71529662b 100644 --- a/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt @@ -3,16 +3,15 @@ if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") return() endif() -set(deps omptarget-nvptx omptarget omp) +set(deps bolt-omptarget bolt-omp) if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) - set(deps ${deps} omptarget-nvptx-bc) + set(deps ${deps} bolt-omptarget-nvptx-bc) endif() -# Don't run by default. -set(EXCLUDE_FROM_ALL True) # Run with only one thread to only launch one application to the GPU at a time.
-add_openmp_testsuite(check-libomptarget-nvptx +add_openmp_testsuite(check-bolt-libomptarget-nvptx "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL DEPENDS ${deps} ARGS -j1) set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING diff --git a/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c b/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c new file mode 100644 index 000000000..60254bc7e --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c @@ -0,0 +1,22 @@ +// RUN: %compile-run-and-check +#include +#include + +int main(){ + int max_threads = -1; + int num_threads = -1; + + #pragma omp target map(tofrom: max_threads) + max_threads = omp_get_max_threads(); + + #pragma omp target parallel map(tofrom: num_threads) + { + #pragma omp master + num_threads = omp_get_num_threads(); + } + + // CHECK: Max Threads: 128, Num Threads: 128 + printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); + + return 0; +} diff --git a/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c index d0d9f3151..efb418fef 100644 --- a/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c +++ b/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c @@ -19,7 +19,14 @@ int main(int argc, char *argv[]) { { MaxThreadsL2 = omp_get_max_threads(); } } - // CHECK: Non-SPMD MaxThreadsL1 = 32 + //FIXME: This Non-SPMD kernel will have 32 active threads due to + // thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of + // threads in block (64 in this case), which translates to worker + // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD + // kernels. According to the spec, omp_get_max_threads must return the + // max active threads possible between the two kernel types. + + // CHECK: Non-SPMD MaxThreadsL1 = 64 printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); // CHECK: Non-SPMD MaxThreadsL2 = 1 printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); diff --git a/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/libomptarget/deviceRTLs/nvptx/test/lit.cfg index 0774c25af..6a42fba05 100644 --- a/libomptarget/deviceRTLs/nvptx/test/lit.cfg +++ b/libomptarget/deviceRTLs/nvptx/test/lit.cfg @@ -16,7 +16,7 @@ def prepend_library_path(name, value, sep): config.environment[name] = value # name: The name of this test suite. -config.name = 'libomptarget-nvptx' +config.name = 'bolt-libomptarget-nvptx' # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp', '.cc'] @@ -45,6 +45,12 @@ config.test_flags = config.test_flags + " " + config.test_extra_flags prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") +# Setup flags for BOLT. If BOLT is not used, they are ignored. +# Some tasking tests require larger stack size. +config.environment['ABT_THREAD_STACKSIZE'] = "262144" +# Sleep alleviates oversubscription overheads when -j is specified. +config.environment['KMP_ABT_SCHED_SLEEP'] = "1" + # Forbid fallback to host. 
config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" @@ -67,3 +73,4 @@ config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%run", "%t")) +config.substitutions.append(("%not", config.libomptarget_not)) diff --git a/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in index d9c14cbc5..f0e02e5d4 100644 --- a/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in +++ b/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in @@ -9,6 +9,7 @@ config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@" # Let the main config do the real work. lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/libomptarget/deviceRTLs/nvptx/test/parallel/level.c index 33e232464..0a137530c 100644 --- a/libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ b/libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) { check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); // Expected to return the current thread num. check2[id] += (omp_get_ancestor_thread_num(1) - id); - // Exepcted to return the current number of threads. + // Expected to return the current number of threads. check2[id] += 3 * omp_get_team_size(1); // Expected to return -1, see above. check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); @@ -68,9 +68,9 @@ int main(int argc, char *argv[]) { int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); // Expected to return the parent thread num. check4Inc += (omp_get_ancestor_thread_num(1) - id); - // Exepcted to return the number of threads in the active parallel region. + // Expected to return the number of threads in the active parallel region. check4Inc += 3 * omp_get_team_size(1); - // Exptected to return 0 and 1. + // Expected to return 0 and 1. check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); // Expected to return -1, see above. check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); diff --git a/libomptarget/include/Debug.h b/libomptarget/include/Debug.h new file mode 100644 index 000000000..a2bb419f0 --- /dev/null +++ b/libomptarget/include/Debug.h @@ -0,0 +1,175 @@ +//===------- Debug.h - Target independent OpenMP target RTL -- C++ --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Routines used to provide debug messages and information from libomptarget +// and plugin RTLs to the user. +// +// Each plugin RTL and libomptarget define TARGET_NAME and DEBUG_PREFIX for use +// when sending messages to the user. These indicate which RTL sent the message +// +// Debug and information messages are controlled by the environment variables +// LIBOMPTARGET_DEBUG and LIBOMPTARGET_INFO which is set upon initialization +// of libomptarget or the plugin RTL. 
+// +// To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, +// use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); +// +// DPxMOD expands to: +// "0x%0*" PRIxPTR +// where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a +// specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: +// "0x%0*lu" +// +// Ultimately, the whole statement expands to: +// printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument +// // specifying the width of the output +// (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width +// // 8 digits for 32bit systems +// // 16 digits for 64bit +// (uintptr_t) ptr); +// +//===----------------------------------------------------------------------===// +#ifndef _OMPTARGET_DEBUG_H +#define _OMPTARGET_DEBUG_H + +#include + +/// 32-Bit field data attributes controlling information presented to the user. +enum OpenMPInfoType : uint32_t { + // Print data arguments and attributes upon entering an OpenMP device kernel. + OMP_INFOTYPE_KERNEL_ARGS = 0x0001, + // Indicate when an address already exists in the device mapping table. + OMP_INFOTYPE_MAPPING_EXISTS = 0x0002, + // Dump the contents of the device pointer map at kernel exit or failure. + OMP_INFOTYPE_DUMP_TABLE = 0x0004, + // Print kernel information from target device plugins. + OMP_INFOTYPE_PLUGIN_KERNEL = 0x0010, + // Enable every flag. + OMP_INFOTYPE_ALL = 0xffffffff, +}; + +static inline uint32_t getInfoLevel() { + static uint32_t InfoLevel = 0; + static std::once_flag Flag{}; + std::call_once(Flag, []() { + if (char *EnvStr = getenv("LIBOMPTARGET_INFO")) + InfoLevel = std::stoi(EnvStr); + }); + + return InfoLevel; +} + +static inline uint32_t getDebugLevel() { + static uint32_t DebugLevel = 0; + static std::once_flag Flag{}; + std::call_once(Flag, []() { + if (char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) + DebugLevel = std::stoi(EnvStr); + }); + + return DebugLevel; +} + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include +#undef __STDC_FORMAT_MACROS + +#define DPxMOD "0x%0*" PRIxPTR +#define DPxPTR(ptr) ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(ptr)) +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) + +/// Print a generic message string from libomptarget or a plugin RTL +#define MESSAGE0(_str) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " message: %s\n", _str); \ + } while (0) + +/// Print a printf formatting string message from libomptarget or a plugin RTL +#define MESSAGE(_str, ...) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " message: " _str "\n", __VA_ARGS__); \ + } while (0) + +/// Print fatal error message with an error string and error identifier +#define FATAL_MESSAGE0(_num, _str) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: %s\n", _num, _str); \ + abort(); \ + } while (0) + +/// Print fatal error message with a printf string and error identifier +#define FATAL_MESSAGE(_num, _str, ...) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d:" _str "\n", _num, \ + __VA_ARGS__); \ + abort(); \ + } while (0) + +/// Print a generic error string from libomptarget or a plugin RTL +#define FAILURE_MESSAGE(...) \ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " error: "); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + +/// Print a generic information string used if LIBOMPTARGET_INFO=1 +#define INFO_MESSAGE(_num, ...) 
\ + do { \ + fprintf(stderr, GETNAME(TARGET_NAME) " device %d info: ", (int)_num); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + +// Debugging messages +#ifdef OMPTARGET_DEBUG +#include + +#define DEBUGP(prefix, ...) \ + { \ + fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); \ + } + +/// Emit a message for debugging +#define DP(...) \ + do { \ + if (getDebugLevel() > 0) { \ + DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \ + } \ + } while (false) + +/// Emit a message for debugging or failure if debugging is disabled +#define REPORT(...) \ + do { \ + if (getDebugLevel() > 0) { \ + DP(__VA_ARGS__); \ + } else { \ + FAILURE_MESSAGE(__VA_ARGS__); \ + } \ + } while (false) +#else +#define DEBUGP(prefix, ...) \ + {} +#define DP(...) \ + {} +#define REPORT(...) FAILURE_MESSAGE(__VA_ARGS__); +#endif // OMPTARGET_DEBUG + +/// Emit a message giving the user extra information about the runtime if +#define INFO(_flags, _id, ...) \ + do { \ + if (getDebugLevel() > 0) { \ + DEBUGP(DEBUG_PREFIX, __VA_ARGS__); \ + } else if (getInfoLevel() & _flags) { \ + INFO_MESSAGE(_id, __VA_ARGS__); \ + } \ + } while (false) + +#endif // _OMPTARGET_DEBUG_H diff --git a/libomptarget/include/SourceInfo.h b/libomptarget/include/SourceInfo.h new file mode 100644 index 000000000..c659d9168 --- /dev/null +++ b/libomptarget/include/SourceInfo.h @@ -0,0 +1,110 @@ +//===------- SourceInfo.h - Target independent OpenMP target RTL -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Methods used to describe source information in target regions +// +//===----------------------------------------------------------------------===// + +#ifndef _SOURCE_INFO_H_ +#define _SOURCE_INFO_H_ + +#include + +#ifdef _WIN32 +static const bool OS_WINDOWS = true; +#else +static const bool OS_WINDOWS = false; +#endif + +/// Type alias for source location information for variable mappings with +/// data layout ";name;filename;row;col;;\0" from clang. +using map_var_info_t = void *; + +/// The ident structure that describes a source location from kmp.h. with +/// source location string data as ";filename;function;line;column;;\0". +struct ident_t { + // Ident_t flags described in kmp.h. + int32_t reserved_1; + int32_t flags; + int32_t reserved_2; + int32_t reserved_3; + char const *psource; +}; + +/// Struct to hold source individual location information. +class SourceInfo { + /// Underlying string copy of the original source information. + const std::string sourceStr; + + /// Location fields extracted from the source information string. + const std::string name; + const std::string filename; + const int32_t line; + const int32_t column; + + std::string initStr(const void *name) { + if (!name) + return ";unknown;unknown;0;0;;"; + else + return std::string(reinterpret_cast(name)); + } + + std::string initStr(const ident_t *loc) { + if (!loc) + return ";unknown;unknown;0;0;;"; + else + return std::string(reinterpret_cast(loc->psource)); + } + + /// Get n-th substring in an expression separated by ;. 
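+  /// For example, for the (hypothetical) encoded string ";main;demo.c;12;4;;",
+  /// getSubstring(0) yields "main", getSubstring(1) yields "demo.c",
+  /// getSubstring(2) yields "12" and getSubstring(3) yields "4".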
+ std::string getSubstring(const int n) const { + std::size_t begin = sourceStr.find(';'); + std::size_t end = sourceStr.find(';', begin + 1); + for (int i = 0; i < n; i++) { + begin = end; + end = sourceStr.find(';', begin + 1); + } + return sourceStr.substr(begin + 1, end - begin - 1); + }; + + /// Get the filename from a full path. + std::string removePath(const std::string &path) const { + std::size_t pos = (OS_WINDOWS) ? path.rfind('\\') : path.rfind('/'); + return path.substr(pos + 1); + }; + +public: + SourceInfo(const ident_t *loc) + : sourceStr(initStr(loc)), name(getSubstring(1)), + filename(removePath(getSubstring(0))), line(std::stoi(getSubstring(2))), + column(std::stoi(getSubstring(3))) {} + + SourceInfo(const map_var_info_t name) + : sourceStr(initStr(name)), name(getSubstring(0)), + filename(removePath(getSubstring(1))), line(std::stoi(getSubstring(2))), + column(std::stoi(getSubstring(3))) {} + + const char *getName() const { return name.c_str(); } + const char *getFilename() const { return filename.c_str(); } + int32_t getLine() const { return line; } + int32_t getColumn() const { return column; } + bool isAvailible() const { return (line || column); } +}; + +/// Standalone function for getting the variable name of a mapping. +static inline std::string getNameFromMapping(const map_var_info_t name) { + if (!name) + return "unknown"; + + const std::string name_str(reinterpret_cast(name)); + std::size_t begin = name_str.find(';'); + std::size_t end = name_str.find(';', begin + 1); + return name_str.substr(begin + 1, end - begin - 1); +} + +#endif diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h index 826d8ed19..9c533944d 100644 --- a/libomptarget/include/omptarget.h +++ b/libomptarget/include/omptarget.h @@ -17,11 +17,12 @@ #include #include +#include + #define OFFLOAD_SUCCESS (0) #define OFFLOAD_FAIL (~0) #define OFFLOAD_DEVICE_DEFAULT -1 -#define HOST_DEVICE -10 /// Data attributes for each data reference used in an OpenMP target region. enum tgt_map_type { @@ -49,6 +50,10 @@ enum tgt_map_type { OMP_TGT_MAPTYPE_IMPLICIT = 0x200, // copy data to device OMP_TGT_MAPTYPE_CLOSE = 0x400, + // runtime error if not already allocated + OMP_TGT_MAPTYPE_PRESENT = 0x1000, + // descriptor for non-contiguous target-update + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; @@ -111,6 +116,22 @@ struct __tgt_target_table { *EntriesEnd; // End of the table with all the entries (non inclusive) }; +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. In CUDA backend, it + // is CUstream. 
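+  // For instance, a CUDA plugin could create the queue lazily on first use
+  // (illustrative sketch, assuming the CUDA driver API):
+  //   if (!AsyncInfoPtr->Queue)
+  //     cuStreamCreate((CUstream *)&AsyncInfoPtr->Queue, CU_STREAM_NON_BLOCKING);
+  // and later wait on it with cuStreamSynchronize((CUstream)AsyncInfoPtr->Queue).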
+ void *Queue = nullptr; +}; + +/// This struct is a record of non-contiguous information +struct __tgt_target_non_contig { + uint64_t Offset; + uint64_t Count; + uint64_t Stride; +}; + #ifdef __cplusplus extern "C" { #endif @@ -151,6 +172,17 @@ void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); +void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers); +void __tgt_target_data_begin_nowait_mapper( + ident_t *loc, int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList); // passes data from the target, release target memory and destroys the // host-target mapping (top entry from the stack of data maps) created by @@ -162,6 +194,16 @@ void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); +void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, map_var_info_t *arg_names, + void **arg_mappers); +void __tgt_target_data_end_nowait_mapper( + ident_t *loc, int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList); /// passes data to/from the target void __tgt_target_data_update(int64_t device_id, int32_t arg_num, @@ -173,6 +215,17 @@ void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); +void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers); +void __tgt_target_data_update_nowait_mapper( + ident_t *loc, int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList); // Performs the same actions as data_begin in case arg_num is non-zero // and initiates run of offloaded region on target platform; if arg_num @@ -187,6 +240,16 @@ int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); +int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers); +int __tgt_target_nowait_mapper(ident_t *loc, int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, void 
**args_base, void **args, int64_t *arg_sizes, @@ -198,51 +261,25 @@ int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, int32_t num_teams, int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); -void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); +int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + int32_t num_teams, int32_t thread_limit); +int __tgt_target_teams_nowait_mapper( + ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t num_teams, + int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList); + +void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id, + uint64_t loop_tripcount); #ifdef __cplusplus } #endif -#ifdef OMPTARGET_DEBUG -#include -#define DEBUGP(prefix, ...) \ - { \ - fprintf(stderr, "%s --> ", prefix); \ - fprintf(stderr, __VA_ARGS__); \ - } - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#define DPxMOD "0x%0*" PRIxPTR -#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) - -/* - * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, - * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); - * - * DPxMOD expands to: - * "0x%0*" PRIxPTR - * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a - * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: - * "0x%0*lu" - * - * Ultimately, the whole statement expands to: - * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument - * // specifying the width of the output - * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width - * // 8 digits for 32bit systems - * // 16 digits for 64bit - * (uintptr_t) ptr); - */ -#else -#define DEBUGP(prefix, ...) \ - {} -#endif - #ifdef __cplusplus #define EXTERN extern "C" #else diff --git a/libomptarget/include/omptargetplugin.h b/libomptarget/include/omptargetplugin.h index e03416ccf..6785e77ed 100644 --- a/libomptarget/include/omptargetplugin.h +++ b/libomptarget/include/omptargetplugin.h @@ -31,6 +31,11 @@ int32_t __tgt_rtl_number_of_devices(void); // having to load the library, which can be expensive. int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); +// Return an integer other than zero if the data can be exchaned from SrcDevId +// to DstDevId. If it is data exchangable, the device plugin should provide +// function to move data from source device to destination device directly. +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId); + // Initialize the requires flags for the device. int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); @@ -58,16 +63,37 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, // case an error occurred on the target device. void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); -// Pass the data content to the target device using the target address. -// In case of success, return zero. Otherwise, return an error code. +// Pass the data content to the target device using the target address. In case +// of success, return zero. Otherwise, return an error code. 
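+// A plugin that implements only the asynchronous path could, for example,
+// express the synchronous entry point in terms of it (illustrative sketch):
+//   __tgt_async_info AsyncInfo;
+//   int32_t Rc = __tgt_rtl_data_submit_async(ID, TargetPtr, HostPtr, Size, &AsyncInfo);
+//   return Rc == OFFLOAD_SUCCESS ? __tgt_rtl_synchronize(ID, &AsyncInfo) : Rc;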
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, int64_t Size); -// Retrieve the data content from the target device using its address. -// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Retrieve the data content from the target device using its address. In case +// of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, int64_t Size); +// Asynchronous version of __tgt_rtl_data_retrieve +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Copy the data content from one target device to another target device using +// its address. This operation does not need to copy data back to host and then +// from host to another device. In case of success, return zero. Otherwise, +// return an error code. +int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID, + void *DstPtr, int64_t Size); + +// Asynchronous version of __tgt_rtl_data_exchange +int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr, + int32_t DesID, void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); @@ -75,18 +101,38 @@ int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); // Transfer control to the offloaded entry Entry on the target device. // Args and Offsets are arrays of NumArgs size of target addresses and // offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. In case of success, return zero. -// Otherwise, return an error code. +// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is +// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be +// ignored on some platforms, like x86_64. In that case, it is synchronous. In +// case of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs); +// Asynchronous version of __tgt_rtl_run_target_region +int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); + // Similar to __tgt_rtl_run_target_region, but additionally specify the -// number of teams to be created and a number of threads in each team. +// number of teams to be created and a number of threads in each team. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount); +// Asynchronous version of __tgt_rtl_run_target_team_region +int32_t __tgt_rtl_run_target_team_region_async( + int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. 
Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); + #ifdef __cplusplus } #endif diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt index f8048ba69..7a2a90d34 100644 --- a/libomptarget/plugins/CMakeLists.txt +++ b/libomptarget/plugins/CMakeLists.txt @@ -1,15 +1,17 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build plugins for the user system if available. # ##===----------------------------------------------------------------------===## +add_subdirectory(common) + # void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); # - build a plugin for an ELF based generic 64-bit target based on libffi. # - tmachine: name of the machine processor as used in the cmake build system. @@ -19,36 +21,42 @@ macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_tripl if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") if(LIBOMPTARGET_DEP_LIBELF_FOUND) if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - + libomptarget_say("Building ${tmachine_name} offloading plugin.") - + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) - + # Define macro to be used as prefix of the runtime messages for this target. add_definitions("-DTARGET_NAME=${tmachine_name}") - + # Define macro with the ELF ID for this target. add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_library("omptarget.rtl.${tmachine_libname}" SHARED + + add_library("bolt-omptarget.rtl.${tmachine_libname}" SHARED ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) - + # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" + install(TARGETS "bolt-omptarget.rtl.${tmachine_libname}" LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - + target_link_libraries( - "omptarget.rtl.${tmachine_libname}" - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + "bolt-omptarget.rtl.${tmachine_libname}" + elf_common + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} dl "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") - + + list(APPEND LIBBOLTTARGET_TESTED_PLUGINS + "bolt-omptarget.rtl.${tmachine_libname}") + # Report to the parent scope that we are building a plugin. - set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) - + set(LIBBOLTTARGET_SYSTEM_TARGETS + "${LIBBOLTTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBBOLTTARGET_TESTED_PLUGINS + "${LIBBOLTTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) @@ -64,8 +72,10 @@ add_subdirectory(aarch64) add_subdirectory(cuda) add_subdirectory(ppc64) add_subdirectory(ppc64le) +add_subdirectory(ve) add_subdirectory(x86_64) # Make sure the parent scope can see the plugins that will be created. 
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBBOLTTARGET_SYSTEM_TARGETS "${LIBBOLTTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBBOLTTARGET_TESTED_PLUGINS "${LIBBOLTTARGET_TESTED_PLUGINS}" PARENT_SCOPE) diff --git a/libomptarget/plugins/amdgpu/CMakeLists.txt b/libomptarget/plugins/amdgpu/CMakeLists.txt new file mode 100644 index 000000000..2ac2a6652 --- /dev/null +++ b/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -0,0 +1,98 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an AMDGPU machine if available. +# +##===----------------------------------------------------------------------===## + +################################################################################ + +# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa +find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) +if (NOT ${hsa-runtime64_FOUND}) + libomptarget_say("Not building AMDGPU plugin: hsa-runtime64 not found") + return() +endif() + +if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building AMDGPU plugin: LIBELF not found") + return() +endif() + +if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux") + libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +endif() + +if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS) + libomptarget_say("Not building AMDGPU plugin: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS") + return() +endif() + +libomptarget_say("Building amdgpu offloading plugin") + +################################################################################ +# Define the suffix for the runtime messaging dumps. +add_definitions(-DTARGET_NAME=AMDGPU) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$") + add_definitions(-DLITTLEENDIAN_CPU=1) +endif() + +if(CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions(-DDEBUG) +endif() + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/impl + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS} +) + +add_library(bolt-omptarget.rtl.amdgpu SHARED + impl/atmi.cpp + impl/atmi_interop_hsa.cpp + impl/data.cpp + impl/get_elf_mach_gfx_name.cpp + impl/machine.cpp + impl/system.cpp + impl/utils.cpp + impl/msgpack.cpp + src/rtl.cpp + ) + +# Install plugin under the lib destination folder. 
+# When we build for debug, OPENMP_LIBDIR_SUFFIX get set to -debug +install(TARGETS bolt-omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}") + +# Install aliases +get_target_property(BOLT_LIBOMPTARGET_LIBRARY_DIR bolt-omptarget.rtl.amdgpu LIBRARY_OUTPUT_DIRECTORY) +if(BOLT_LIBOMPTARGET_LIBRARY_DIR) + add_custom_command(TARGET bolt-omptarget.rtl.amdgpu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget.rtl.amdgpu${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_SHARED_LIBRARY_PREFIX}omptarget.rtl.amdgpu${CMAKE_SHARED_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${BOLT_LIBOMPTARGET_LIBRARY_DIR} + ) +endif() +install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget.rtl.amdgpu${CMAKE_SHARED_LIBRARY_SUFFIX}\" + \"${CMAKE_SHARED_LIBRARY_PREFIX}omptarget.rtl.amdgpu${CMAKE_SHARED_LIBRARY_SUFFIX}\" WORKING_DIRECTORY + \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR})") + +set_property(TARGET bolt-omptarget.rtl.amdgpu PROPERTY INSTALL_RPATH "$ORIGIN") +target_link_libraries( + bolt-omptarget.rtl.amdgpu + PRIVATE + elf_common + hsa-runtime64::hsa-runtime64 + pthread dl elf + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + ) + +# Report to the parent scope that we are building a plugin for amdgpu +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE) + diff --git a/libomptarget/plugins/amdgpu/impl/atmi.cpp b/libomptarget/plugins/amdgpu/impl/atmi.cpp new file mode 100644 index 000000000..285dc2dbe --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/atmi.cpp @@ -0,0 +1,141 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#include "atmi_runtime.h" +#include "internal.h" +#include "rt.h" +#include +#include +#include + +/* + * Initialize/Finalize + */ +atmi_status_t atmi_init() { return core::Runtime::Initialize(); } + +atmi_status_t atmi_finalize() { return core::Runtime::Finalize(); } + +/* + * Machine Info + */ +atmi_machine_t *atmi_machine_get_info() { + return core::Runtime::GetMachineInfo(); +} + +/* + * Modules + */ +atmi_status_t atmi_module_register_from_memory_to_place( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state) { + return core::Runtime::getInstance().RegisterModuleFromMemory( + module_bytes, module_size, place, on_deserialized_data, cb_state); +} + +/* + * Data + */ + +static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, + const void *src, size_t size, + hsa_agent_t agent) { + const hsa_signal_value_t init = 1; + const hsa_signal_value_t success = 0; + hsa_signal_store_screlease(sig, init); + + hsa_status_t err = + hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig); + if (err != HSA_STATUS_SUCCESS) { + return err; + } + + // async_copy reports success by decrementing and failure by setting to < 0 + hsa_signal_value_t got = init; + while (got == init) { + got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, + UINT64_MAX, ATMI_WAIT_STATE); + } + + if (got != success) { + return HSA_STATUS_ERROR; + } + + return err; +} + +struct atmiFreePtrDeletor { + void operator()(void *p) { + atmi_free(p); // ignore failure to free + } +}; + +atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, + const void *hostSrc, size_t size, + hsa_agent_t agent) { + hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size); + + // hsa_memory_copy sometimes fails in situations where + // allocate + copy succeeds. Looks like it might be related to + // locking part of a read only segment. Fall back for now. + if (rc == HSA_STATUS_SUCCESS) { + return ATMI_STATUS_SUCCESS; + } + + void *tempHostPtr; + atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); + atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU); + if (ret != ATMI_STATUS_SUCCESS) { + DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n", + size); + return ret; + } + std::unique_ptr del(tempHostPtr); + memcpy(tempHostPtr, hostSrc, size); + + if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) != + HSA_STATUS_SUCCESS) { + return ATMI_STATUS_ERROR; + } + return ATMI_STATUS_SUCCESS; +} + +atmi_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest, + const void *deviceSrc, size_t size, + hsa_agent_t agent) { + hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size); + + // hsa_memory_copy sometimes fails in situations where + // allocate + copy succeeds. Looks like it might be related to + // locking part of a read only segment. Fall back for now. 
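+  // (The fallback below stages the transfer through a host buffer: async-copy
+  // device -> staging, then memcpy staging -> dest. The h2d variant above does
+  // the same in the opposite order.)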
+ if (rc == HSA_STATUS_SUCCESS) { + return ATMI_STATUS_SUCCESS; + } + + void *tempHostPtr; + atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); + atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU); + if (ret != ATMI_STATUS_SUCCESS) { + DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n", + size); + return ret; + } + std::unique_ptr del(tempHostPtr); + + if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) != + HSA_STATUS_SUCCESS) { + return ATMI_STATUS_ERROR; + } + + memcpy(dest, tempHostPtr, size); + return ATMI_STATUS_SUCCESS; +} + +atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); } + +atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) { + return core::Runtime::Malloc(ptr, size, place); +} diff --git a/libomptarget/plugins/amdgpu/impl/atmi.h b/libomptarget/plugins/amdgpu/impl/atmi.h new file mode 100644 index 000000000..35e61216c --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/atmi.h @@ -0,0 +1,194 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_H_ +#define INCLUDE_ATMI_H_ + +#define ROCM_VERSION_MAJOR 3 +#define ROCM_VERSION_MINOR 2 + +/** \defgroup enumerations Enumerated Types + * @{ + */ + +/** + * @brief Status codes. + */ +typedef enum atmi_status_t { + /** + * The function has been executed successfully. + */ + ATMI_STATUS_SUCCESS = 0, + /** + * A undocumented error has occurred. + */ + ATMI_STATUS_UNKNOWN = 1, + /** + * A generic error has occurred. + */ + ATMI_STATUS_ERROR = 2, +} atmi_status_t; + +/** + * @brief Device Types. + */ +typedef enum atmi_devtype_s { + ATMI_DEVTYPE_CPU = 0x0001, + ATMI_DEVTYPE_iGPU = 0x0010, // Integrated GPU + ATMI_DEVTYPE_dGPU = 0x0100, // Discrete GPU + ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU + ATMI_DEVTYPE_ALL = 0x111 // Union of all device types +} atmi_devtype_t; + +/** + * @brief Memory Access Type. + */ +typedef enum atmi_memtype_s { + ATMI_MEMTYPE_FINE_GRAINED = 0, + ATMI_MEMTYPE_COARSE_GRAINED = 1, + ATMI_MEMTYPE_ANY +} atmi_memtype_t; + +/** + * @brief ATMI Memory Fences for Tasks. + */ +typedef enum atmi_task_fence_scope_s { + /** + * No memory fence applied; external fences have to be applied around the task + * launch/completion. + */ + ATMI_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied to the device. + */ + ATMI_FENCE_SCOPE_DEVICE = 1, + /** + * The fence is applied to the entire system. + */ + ATMI_FENCE_SCOPE_SYSTEM = 2 +} atmi_task_fence_scope_t; + +/** @} */ + +/** \defgroup common Common ATMI Structures + * @{ + */ + +/** + * @brief ATMI Compute Place + */ +typedef struct atmi_place_s { + /** + * The node in a cluster where computation should occur. + * Default is node_id = 0 for local computations. + */ + unsigned int node_id; + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t type; + /** + * The device ordinal number ordered by runtime; -1 for any + */ + int device_id; +} atmi_place_t; + +/** + * @brief ATMI Memory Place + */ +typedef struct atmi_mem_place_s { + /** + * The node in a cluster where computation should occur. + * Default is node_id = 0 for local computations. 
+ */ + unsigned int node_id; + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t dev_type; + /** + * The device ordinal number ordered by runtime; -1 for any + */ + int dev_id; + // atmi_memtype_t mem_type; // Fine grained or Coarse grained + /** + * The memory space/region ordinal number ordered by runtime; -1 for any + */ + int mem_id; +} atmi_mem_place_t; + +/** + * @brief ATMI Memory Space/region Structure + */ +typedef struct atmi_memory_s { + /** + * Memory capacity + */ + unsigned long int capacity; + /** + * Memory type + */ + atmi_memtype_t type; +} atmi_memory_t; + +/** + * @brief ATMI Device Structure + */ +typedef struct atmi_device_s { + /** + * Device type: CPU, GPU or DSP + */ + atmi_devtype_t type; + /** + * Array of memory spaces/regions that are accessible + * from this device. + */ + atmi_memory_t *memories; +} atmi_device_t; + +/** + * @brief ATMI Machine Structure + */ +typedef struct atmi_machine_s { + /** + * The number of devices categorized by the device type + */ + unsigned int device_count_by_type[ATMI_DEVTYPE_ALL]; + /** + * The device structures categorized by the device type + */ + atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL]; +} atmi_machine_t; + +// Below are some helper macros that can be used to setup +// some of the ATMI data structures. +#define ATMI_PLACE_CPU(node, cpu_id) \ + { .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id } +#define ATMI_PLACE_GPU(node, gpu_id) \ + { .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id } +#define ATMI_MEM_PLACE_CPU(node, cpu_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \ + .mem_id = -1 \ + } +#define ATMI_MEM_PLACE_GPU(node, gpu_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \ + .mem_id = -1 \ + } +#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \ + .mem_id = cpu_mem_id \ + } +#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id) \ + { \ + .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \ + .mem_id = gpu_mem_id \ + } +#define ATMI_MEM_PLACE(d_type, d_id, m_id) \ + { .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id } + +#endif // INCLUDE_ATMI_H_ diff --git a/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp new file mode 100644 index 000000000..eb4a46c35 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp @@ -0,0 +1,96 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#include "atmi_interop_hsa.h" +#include "internal.h" + +using core::atl_is_atmi_initialized; + +atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, + const char *symbol, + void **var_addr, + unsigned int *var_size) { + /* + // Typical usage: + void *var_addr; + size_t var_size; + atmi_interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, + &var_size); + atmi_memcpy(signal, host_add, var_addr, var_size); + */ + + if (!atl_is_atmi_initialized()) + return ATMI_STATUS_ERROR; + atmi_machine_t *machine = atmi_machine_get_info(); + if (!symbol || !var_addr || !var_size || !machine) + return ATMI_STATUS_ERROR; + if (place.dev_id < 0 || + place.dev_id >= machine->device_count_by_type[place.dev_type]) + return ATMI_STATUS_ERROR; + + // get the symbol info + std::string symbolStr = std::string(symbol); + if (SymbolInfoTable[place.dev_id].find(symbolStr) != + SymbolInfoTable[place.dev_id].end()) { + atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr]; + *var_addr = reinterpret_cast(info.addr); + *var_size = info.size; + return ATMI_STATUS_SUCCESS; + } else { + *var_addr = NULL; + *var_size = 0; + return ATMI_STATUS_ERROR; + } +} + +atmi_status_t atmi_interop_hsa_get_kernel_info( + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t kernel_info, uint32_t *value) { + /* + // Typical usage: + uint32_t value; + atmi_interop_hsa_get_kernel_addr(gpu_place, "kernel_name", + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &val); + */ + + if (!atl_is_atmi_initialized()) + return ATMI_STATUS_ERROR; + atmi_machine_t *machine = atmi_machine_get_info(); + if (!kernel_name || !value || !machine) + return ATMI_STATUS_ERROR; + if (place.dev_id < 0 || + place.dev_id >= machine->device_count_by_type[place.dev_type]) + return ATMI_STATUS_ERROR; + + atmi_status_t status = ATMI_STATUS_SUCCESS; + // get the kernel info + std::string kernelStr = std::string(kernel_name); + if (KernelInfoTable[place.dev_id].find(kernelStr) != + KernelInfoTable[place.dev_id].end()) { + atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr]; + switch (kernel_info) { + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: + *value = info.group_segment_size; + break; + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: + *value = info.private_segment_size; + break; + case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: + // return the size for non-implicit args + *value = info.kernel_segment_size - sizeof(atmi_implicit_args_t); + break; + default: + *value = 0; + status = ATMI_STATUS_ERROR; + break; + } + } else { + *value = 0; + status = ATMI_STATUS_ERROR; + } + + return status; +} diff --git a/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h new file mode 100644 index 000000000..c0f588215 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h @@ -0,0 +1,86 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_INTEROP_HSA_H_ +#define INCLUDE_ATMI_INTEROP_HSA_H_ + +#include "atmi_runtime.h" +#include "hsa.h" +#include "hsa_ext_amd.h" + +#ifdef __cplusplus +extern "C" { +#endif +/** \defgroup interop_hsa_functions ATMI-HSA Interop + * @{ + */ + +/** + * @brief Get the device address and size of an HSA global symbol + * + * @detail Use this function to query the device address and size of an HSA + * global symbol. + * The symbol can be set at by the compiler or by the application writer in a + * language-specific manner. This function is meaningful only after calling one + * of the @p atmi_module_register functions. + * + * @param[in] place The ATMI memory place + * + * @param[in] symbol Pointer to a non-NULL global symbol name + * + * @param[in] var_addr Pointer to a non-NULL @p void* variable that will + * hold the device address of the global symbol object. + * + * @param[in] var_size Pointer to a non-NULL @p uint variable that will + * hold the size of the global symbol object. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are + * invalid + * location in the current node, or if ATMI is not initialized. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place, + const char *symbol, + void **var_addr, + unsigned int *var_size); + +/** + * @brief Get the HSA-specific kernel info from a kernel name + * + * @detail Use this function to query the HSA-specific kernel info from the + * kernel name. + * This function is meaningful only after calling one + * of the @p atmi_module_register functions. + * + * @param[in] place The ATMI memory place + * + * @param[in] kernel_name Pointer to a char array with the kernel name + * + * @param[in] info The different possible kernel properties + * + * @param[in] value Pointer to a non-NULL @p uint variable that will + * hold the return value of the kernel property. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are + * invalid + * location in the current node, or if ATMI is not initialized. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_interop_hsa_get_kernel_info( + atmi_mem_place_t place, const char *kernel_name, + hsa_executable_symbol_info_t info, uint32_t *value); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_ATMI_INTEROP_HSA_H_ diff --git a/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/libomptarget/plugins/amdgpu/impl/atmi_runtime.h new file mode 100644 index 000000000..47022f7f5 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/atmi_runtime.h @@ -0,0 +1,172 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#ifndef INCLUDE_ATMI_RUNTIME_H_ +#define INCLUDE_ATMI_RUNTIME_H_ + +#include "atmi.h" +#include "hsa.h" +#include +#include +#ifndef __cplusplus +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** \defgroup context_functions ATMI Context Setup and Finalize + * @{ + */ +/** + * @brief Initialize the ATMI runtime environment. + * + * @detal All ATMI runtime functions will fail if this function is not called + * at least once. The user may initialize difference device types at different + * regions in the program in order for optimization purposes. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_init(); + +/** + * @brief Finalize the ATMI runtime environment. + * + * @detail ATMI runtime functions will fail if called after finalize. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + */ +atmi_status_t atmi_finalize(); +/** @} */ + +/** \defgroup module_functions ATMI Module + * @{ + */ + +/** + * @brief Register the ATMI code module from memory on to a specific place + * (device). + * + * @detail Currently, only GPU devices need explicit module registration because + * of their specific ISAs that require a separate compilation phase. On the + * other + * hand, CPU devices execute regular x86 functions that are compiled with the + * host program. + * + * @param[in] module_bytes A memory region that contains the GPU modules + * targeting ::AMDGCN platform types. Value cannot be NULL. + * + * @param[in] module_size Size of module region + * + * @param[in] place Denotes the execution place (device) on which the module + * should be registered and loaded. + * + * @param[in] on_deserialized_data Callback run on deserialized code object, + * before loading it + * + * @param[in] cb_state void* passed to on_deserialized_data callback + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_module_register_from_memory_to_place( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state); + +/** @} */ + +/** \defgroup machine ATMI Machine + * @{ + */ +/** + * @brief ATMI's device discovery function to get the current machine's + * topology. + * + * @detail The @p atmi_machine_t structure is a tree-based representation of the + * compute and memory elements in the current node. Once ATMI is initialized, + * this function can be called to retrieve the pointer to this global structure. + * + * @return Returns a pointer to a global structure of tyoe @p atmi_machine_t. + * Returns NULL if ATMI is not initialized. + */ +atmi_machine_t *atmi_machine_get_info(); +/** @} */ + +/** \defgroup memory_functions ATMI Data Management + * @{ + */ +/** + * @brief Allocate memory from the specified memory place. + * + * @detail This function allocates memory from the specified memory place. 
If + * the memory + * place belongs primarily to the CPU, then the memory will be accessible by + * other GPUs and CPUs in the system. If the memory place belongs primarily to a + * GPU, + * then it cannot be accessed by other devices in the system. + * + * @param[in] ptr The pointer to the memory that will be allocated. + * + * @param[in] size The size of the allocation in bytes. + * + * @param[in] place The memory place in the system to perform the allocation. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place); + +/** + * @brief Frees memory that was previously allocated. + * + * @detail This function frees memory that was previously allocated by calling + * @p atmi_malloc. It throws an error otherwise. It is illegal to access a + * pointer after a call to this function. + * + * @param[in] ptr The pointer to the memory that has to be freed. + * + * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully. + * + * @retval ::ATMI_STATUS_ERROR The function encountered errors. + * + * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors. + * + */ +atmi_status_t atmi_free(void *ptr); + +atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, + const void *hostSrc, size_t size, + hsa_agent_t agent); + +atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *hostDest, + const void *deviceSrc, size_t size, + hsa_agent_t agent); + +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_ATMI_RUNTIME_H_ diff --git a/libomptarget/plugins/amdgpu/impl/data.cpp b/libomptarget/plugins/amdgpu/impl/data.cpp new file mode 100644 index 000000000..39546fbae --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/data.cpp @@ -0,0 +1,85 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include "atmi_runtime.h" +#include "internal.h" +#include "machine.h" +#include "rt.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using core::TaskImpl; +extern ATLMachine g_atl_machine; + +namespace core { +void allow_access_to_all_gpu_agents(void *ptr); + +const char *getPlaceStr(atmi_devtype_t type) { + switch (type) { + case ATMI_DEVTYPE_CPU: + return "CPU"; + case ATMI_DEVTYPE_GPU: + return "GPU"; + default: + return NULL; + } +} + +ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { + int dev_id = place.dev_id; + switch (place.dev_type) { + case ATMI_DEVTYPE_CPU: + return g_atl_machine.processors()[dev_id]; + case ATMI_DEVTYPE_GPU: + return g_atl_machine.processors()[dev_id]; + } +} + +hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) { + ATLProcessor &proc = get_processor_by_mem_place(place); + return get_memory_pool(proc, place.mem_id); +} + +void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { + if (place.dev_type == ATMI_DEVTYPE_CPU) + allow_access_to_all_gpu_agents(ptr); + // TODO(ashwinma): what if one GPU wants to access another GPU? 
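+  // One possible direction for the TODO (sketch only): gather the peer GPU
+  // agents and call
+  //   hsa_amd_agents_allow_access(num_agents, agents, NULL, ptr);
+  // so the allocation becomes visible to those devices as well.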
+} + +atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { + atmi_status_t ret = ATMI_STATUS_SUCCESS; + hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place); + hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr); + ErrorCheck(atmi_malloc, err); + DEBUG_PRINT("Malloced [%s %d] %p\n", + place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id, + *ptr); + if (err != HSA_STATUS_SUCCESS) + ret = ATMI_STATUS_ERROR; + + register_allocation(*ptr, size, place); + + return ret; +} + +atmi_status_t Runtime::Memfree(void *ptr) { + atmi_status_t ret = ATMI_STATUS_SUCCESS; + hsa_status_t err; + err = hsa_amd_memory_pool_free(ptr); + ErrorCheck(atmi_free, err); + DEBUG_PRINT("Freed %p\n", ptr); + + if (err != HSA_STATUS_SUCCESS) + ret = ATMI_STATUS_ERROR; + return ret; +} + +} // namespace core diff --git a/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp new file mode 100644 index 000000000..45af34684 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp @@ -0,0 +1,53 @@ +#include "get_elf_mach_gfx_name.h" + +// This header conflicts with the system elf.h (macros vs enums of the same +// identifier) and contains more up to date values for the enum checked here. +// rtl.cpp uses the system elf.h. +#include "llvm/BinaryFormat/ELF.h" + +const char *get_elf_mach_gfx_name(uint32_t EFlags) { + using namespace llvm::ELF; + uint32_t Gfx = (EFlags & EF_AMDGPU_MACH); + switch (Gfx) { + case EF_AMDGPU_MACH_AMDGCN_GFX801: + return "gfx801"; + case EF_AMDGPU_MACH_AMDGCN_GFX802: + return "gfx802"; + case EF_AMDGPU_MACH_AMDGCN_GFX803: + return "gfx803"; + case EF_AMDGPU_MACH_AMDGCN_GFX805: + return "gfx805"; + case EF_AMDGPU_MACH_AMDGCN_GFX810: + return "gfx810"; + case EF_AMDGPU_MACH_AMDGCN_GFX900: + return "gfx900"; + case EF_AMDGPU_MACH_AMDGCN_GFX902: + return "gfx902"; + case EF_AMDGPU_MACH_AMDGCN_GFX904: + return "gfx904"; + case EF_AMDGPU_MACH_AMDGCN_GFX906: + return "gfx906"; + case EF_AMDGPU_MACH_AMDGCN_GFX908: + return "gfx908"; + case EF_AMDGPU_MACH_AMDGCN_GFX909: + return "gfx909"; + case EF_AMDGPU_MACH_AMDGCN_GFX90C: + return "gfx90c"; + case EF_AMDGPU_MACH_AMDGCN_GFX1010: + return "gfx1010"; + case EF_AMDGPU_MACH_AMDGCN_GFX1011: + return "gfx1011"; + case EF_AMDGPU_MACH_AMDGCN_GFX1012: + return "gfx1012"; + case EF_AMDGPU_MACH_AMDGCN_GFX1030: + return "gfx1030"; + case EF_AMDGPU_MACH_AMDGCN_GFX1031: + return "gfx1031"; + case EF_AMDGPU_MACH_AMDGCN_GFX1032: + return "gfx1032"; + case EF_AMDGPU_MACH_AMDGCN_GFX1033: + return "gfx1033"; + default: + return "--unknown gfx"; + } +} diff --git a/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h new file mode 100644 index 000000000..b1be90dc2 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h @@ -0,0 +1,8 @@ +#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED +#define GET_ELF_MACH_GFX_NAME_H_INCLUDED + +#include + +const char *get_elf_mach_gfx_name(uint32_t EFlags); + +#endif diff --git a/libomptarget/plugins/amdgpu/impl/internal.h b/libomptarget/plugins/amdgpu/impl/internal.h new file mode 100644 index 000000000..1b1d69328 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/internal.h @@ -0,0 +1,266 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_ +#define SRC_RUNTIME_INCLUDE_INTERNAL_H_ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hsa.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" + +#include "atmi.h" +#include "atmi_runtime.h" +#include "rt.h" + +#define MAX_NUM_KERNELS (1024 * 16) + +typedef struct atmi_implicit_args_s { + unsigned long offset_x; + unsigned long offset_y; + unsigned long offset_z; + unsigned long hostcall_ptr; + char num_gpu_queues; + unsigned long gpu_queue_ptr; + char num_cpu_queues; + unsigned long cpu_worker_signals; + unsigned long cpu_queue_ptr; + unsigned long kernarg_template_ptr; +} atmi_implicit_args_t; + +#ifdef __cplusplus +extern "C" { +#endif + +#define check(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("%s failed.\n", #msg); \ + exit(1); \ + } + +#ifdef DEBUG +#define DEBUG_PRINT(fmt, ...) \ + if (core::Runtime::getInstance().getDebugMode()) { \ + fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \ + } +#else +#define DEBUG_PRINT(...) \ + do { \ + } while (false) +#endif + +#ifndef HSA_RUNTIME_INC_HSA_H_ +typedef struct hsa_signal_s { + uint64_t handle; +} hsa_signal_t; +#endif + +/* All global values go in this global structure */ +typedef struct atl_context_s { + bool struct_initialized; + bool g_hsa_initialized; + bool g_gpu_initialized; + bool g_tasks_initialized; +} atl_context_t; +extern atl_context_t atlc; +extern atl_context_t *atlc_p; + +#ifdef __cplusplus +} +#endif + +/* --------------------------------------------------------------------------------- + * Simulated CPU Data Structures and API + * --------------------------------------------------------------------------------- + */ + +#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED + +// ---------------------- Kernel Start ------------- +typedef struct atl_kernel_info_s { + uint64_t kernel_object; + uint32_t group_segment_size; + uint32_t private_segment_size; + uint32_t kernel_segment_size; + uint32_t num_args; + std::vector arg_alignments; + std::vector arg_offsets; + std::vector arg_sizes; +} atl_kernel_info_t; + +typedef struct atl_symbol_info_s { + uint64_t addr; + uint32_t size; +} atl_symbol_info_t; + +extern std::vector> KernelInfoTable; +extern std::vector> SymbolInfoTable; + +// ---------------------- Kernel End ------------- + +extern struct timespec context_init_time; + +namespace core { +class TaskgroupImpl; +class TaskImpl; +class Kernel; +class KernelImpl; +} // namespace core + +struct SignalPoolT { + SignalPoolT() { + // If no signals are created, and none can be created later, + // will ultimately fail at pop() + + unsigned N = 1024; // default max pool size from atmi + for (unsigned i = 0; i < N; i++) { + hsa_signal_t new_signal; + hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); + if (err != HSA_STATUS_SUCCESS) { + break; + } + state.push(new_signal); + } + DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size()); + } + SignalPoolT(const SignalPoolT &) = delete; + SignalPoolT(SignalPoolT &&) = delete; + ~SignalPoolT() { + size_t N = state.size(); + for (size_t i = 0; i < N; i++) { + hsa_signal_t signal = state.front(); + state.pop(); + hsa_status_t rc = hsa_signal_destroy(signal); + if (rc != HSA_STATUS_SUCCESS) { + DEBUG_PRINT("Signal pool destruction failed\n"); + } + } + } + size_t size() { + lock l(&mutex); + return 
state.size(); + } + void push(hsa_signal_t s) { + lock l(&mutex); + state.push(s); + } + hsa_signal_t pop(void) { + lock l(&mutex); + if (!state.empty()) { + hsa_signal_t res = state.front(); + state.pop(); + return res; + } + + // Pool empty, attempt to create another signal + hsa_signal_t new_signal; + hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal); + if (err == HSA_STATUS_SUCCESS) { + return new_signal; + } + + // Fail + return {0}; + } + +private: + static pthread_mutex_t mutex; + std::queue state; + struct lock { + lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } + ~lock() { pthread_mutex_unlock(m); } + pthread_mutex_t *m; + }; +}; + +extern std::vector atl_gpu_kernarg_pools; + +namespace core { +atmi_status_t atl_init_gpu_context(); + +hsa_status_t init_hsa(); +hsa_status_t finalize_hsa(); +/* + * Generic utils + */ +template inline T alignDown(T value, size_t alignment) { + return (T)(value & ~(alignment - 1)); +} + +template inline T *alignDown(T *value, size_t alignment) { + return reinterpret_cast(alignDown((intptr_t)value, alignment)); +} + +template inline T alignUp(T value, size_t alignment) { + return alignDown((T)(value + alignment - 1), alignment); +} + +template inline T *alignUp(T *value, size_t alignment) { + return reinterpret_cast( + alignDown((intptr_t)(value + alignment - 1), alignment)); +} + +extern void register_allocation(void *addr, size_t size, + atmi_mem_place_t place); +extern hsa_amd_memory_pool_t +get_memory_pool_by_mem_place(atmi_mem_place_t place); +extern bool atl_is_atmi_initialized(); + +bool handle_group_signal(hsa_signal_value_t value, void *arg); + +void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest); +uint16_t +create_header(hsa_packet_type_t type, int barrier, + atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM, + atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM); + +void allow_access_to_all_gpu_agents(void *ptr); +} // namespace core + +const char *get_error_string(hsa_status_t err); +const char *get_atmi_error_string(atmi_status_t err); + +#define ATMIErrorCheck(msg, status) \ + if (status != ATMI_STATUS_SUCCESS) { \ + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_atmi_error_string(status)); \ + exit(1); \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#define ErrorCheck(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_error_string(status)); \ + exit(1); \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#define ErrorCheckAndContinue(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \ + get_error_string(status)); \ + continue; \ + } else { \ + /* printf("%s succeeded.\n", #msg);*/ \ + } + +#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/libomptarget/plugins/amdgpu/impl/machine.cpp b/libomptarget/plugins/amdgpu/impl/machine.cpp new file mode 100644 index 000000000..ff8ac1c35 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/machine.cpp @@ -0,0 +1,56 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. 
+ *===------------------------------------------------------------------------*/ +#include "machine.h" +#include "atmi_runtime.h" +#include "internal.h" +#include +#include +#include +#include +#include +#include + +extern ATLMachine g_atl_machine; +extern hsa_region_t atl_cpu_kernarg_region; + +void ATLProcessor::addMemory(const ATLMemory &mem) { + for (auto &mem_obj : memories_) { + // if the memory already exists, then just return + if (mem.memory().handle == mem_obj.memory().handle) + return; + } + memories_.push_back(mem); +} + +const std::vector &ATLProcessor::memories() const { + return memories_; +} + +template <> std::vector &ATLMachine::processors() { + return cpu_processors_; +} + +template <> std::vector &ATLMachine::processors() { + return gpu_processors_; +} + +hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, + const int mem_id) { + hsa_amd_memory_pool_t pool; + const std::vector &mems = proc.memories(); + assert(mems.size() && mem_id >= 0 && mem_id < mems.size() && + "Invalid memory pools for this processor"); + pool = mems[mem_id].memory(); + return pool; +} + +template <> void ATLMachine::addProcessor(const ATLCPUProcessor &p) { + cpu_processors_.push_back(p); +} + +template <> void ATLMachine::addProcessor(const ATLGPUProcessor &p) { + gpu_processors_.push_back(p); +} diff --git a/libomptarget/plugins/amdgpu/impl/machine.h b/libomptarget/plugins/amdgpu/impl/machine.h new file mode 100644 index 000000000..93169ed4e --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/machine.h @@ -0,0 +1,95 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_MACHINE_H_ +#define SRC_RUNTIME_INCLUDE_MACHINE_H_ +#include "atmi.h" +#include "internal.h" +#include +#include +#include + +class ATLMemory; + +class ATLProcessor { +public: + explicit ATLProcessor(hsa_agent_t agent, + atmi_devtype_t type = ATMI_DEVTYPE_ALL) + : agent_(agent), type_(type) { + memories_.clear(); + } + void addMemory(const ATLMemory &p); + hsa_agent_t agent() const { return agent_; } + // TODO(ashwinma): Do we need this or are we building the machine structure + // just once in the program? 
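+  // Note: memories_ is effectively append-only; pools are discovered once per
+  // agent by get_memory_pool_info() in system.cpp and later read back through
+  // get_memory_pool(proc, mem_id).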
+ // void removeMemory(ATLMemory &p); + const std::vector &memories() const; + atmi_devtype_t type() const { return type_; } + +protected: + hsa_agent_t agent_; + atmi_devtype_t type_; + std::vector memories_; +}; + +class ATLCPUProcessor : public ATLProcessor { +public: + explicit ATLCPUProcessor(hsa_agent_t agent) + : ATLProcessor(agent, ATMI_DEVTYPE_CPU) {} +}; + +class ATLGPUProcessor : public ATLProcessor { +public: + explicit ATLGPUProcessor(hsa_agent_t agent, + atmi_devtype_t type = ATMI_DEVTYPE_dGPU) + : ATLProcessor(agent, type) {} +}; + +class ATLMemory { +public: + ATLMemory(hsa_amd_memory_pool_t pool, ATLProcessor p, atmi_memtype_t t) + : memory_pool_(pool), processor_(p), type_(t) {} + hsa_amd_memory_pool_t memory() const { return memory_pool_; } + + atmi_memtype_t type() const { return type_; } + +private: + hsa_amd_memory_pool_t memory_pool_; + ATLProcessor processor_; + atmi_memtype_t type_; +}; + +class ATLMachine { +public: + ATLMachine() { + cpu_processors_.clear(); + gpu_processors_.clear(); + } + template void addProcessor(const T &p); + template std::vector &processors(); + template size_t processorCount() { + return processors().size(); + } + +private: + std::vector cpu_processors_; + std::vector gpu_processors_; +}; + +hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc, + const int mem_id); + +extern ATLMachine g_atl_machine; +template T &get_processor(atmi_place_t place) { + int dev_id = place.device_id; + if (dev_id == -1) { + // user is asking runtime to pick a device + // TODO(ashwinma): best device of this type? pick 0 for now + dev_id = 0; + } + return g_atl_machine.processors()[dev_id]; +} + +#endif // SRC_RUNTIME_INCLUDE_MACHINE_H_ diff --git a/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/libomptarget/plugins/amdgpu/impl/msgpack.cpp new file mode 100644 index 000000000..6da12f937 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/msgpack.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include + +#include "msgpack.h" + +namespace msgpack { + +[[noreturn]] void internal_error() { + printf("internal error\n"); + exit(1); +} + +const char *type_name(type ty) { + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return #NAME; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +unsigned bytes_used_fixed(msgpack::type ty) { + using namespace msgpack; + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return WIDTH; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +msgpack::type parse_type(unsigned char x) { + +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + if (x >= LOWER && x <= UPPER) { \ + return NAME; \ + } else +#include "msgpack.def" +#undef X + { internal_error(); } +} + +template R bitcast(T x) { + static_assert(sizeof(T) == sizeof(R), ""); + R tmp; + memcpy(&tmp, &x, sizeof(T)); + return tmp; +} +template int64_t bitcast(uint64_t); +} // namespace msgpack + +// Helper functions for reading additional payload from the header +// Depending on the type, this can be a number of bytes, elements, +// key-value pairs or an embedded integer. 
+// Each takes a pointer to the start of the header and returns a uint64_t + +namespace { +namespace payload { +uint64_t read_zero(const unsigned char *) { return 0; } + +// Read the first byte and zero/sign extend it +uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; } +uint64_t read_embedded_s8(const unsigned char *start) { + int64_t res = msgpack::bitcast(start[0]); + return msgpack::bitcast(res); +} + +// Read a masked part of the first byte +uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; } +uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; } +uint64_t read_via_mask_0x1f(const unsigned char *start) { + return *start & 0x1fu; +} + +// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend +// Big endian format. +uint64_t read_size_field_u8(const unsigned char *from) { + from++; + return from[0]; +} + +// TODO: detect whether host is little endian or not, and whether the intrinsic +// is available. And probably use the builtin to test the diy +const bool use_bswap = false; + +uint64_t read_size_field_u16(const unsigned char *from) { + from++; + if (use_bswap) { + uint16_t b; + memcpy(&b, from, 2); + return __builtin_bswap16(b); + } else { + return (from[0] << 8u) | from[1]; + } +} +uint64_t read_size_field_u32(const unsigned char *from) { + from++; + if (use_bswap) { + uint32_t b; + memcpy(&b, from, 4); + return __builtin_bswap32(b); + } else { + return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) | + (from[3] << 0u); + } +} +uint64_t read_size_field_u64(const unsigned char *from) { + from++; + if (use_bswap) { + uint64_t b; + memcpy(&b, from, 8); + return __builtin_bswap64(b); + } else { + return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) | + ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) | + (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) | + (from[7] << 0u); + } +} + +uint64_t read_size_field_s8(const unsigned char *from) { + uint8_t u = read_size_field_u8(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s16(const unsigned char *from) { + uint16_t u = read_size_field_u16(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s32(const unsigned char *from) { + uint32_t u = read_size_field_u32(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +uint64_t read_size_field_s64(const unsigned char *from) { + uint64_t u = read_size_field_u64(from); + int64_t res = msgpack::bitcast(u); + return msgpack::bitcast(res); +} +} // namespace payload +} // namespace + +namespace msgpack { + +payload_info_t payload_info(msgpack::type ty) { + using namespace msgpack; + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case NAME: \ + return payload::PAYLOAD; +#include "msgpack.def" +#undef X + } + internal_error(); +} + +} // namespace msgpack + +const unsigned char *msgpack::skip_next_message(const unsigned char *start, + const unsigned char *end) { + class f : public functors_defaults {}; + return handle_msgpack({start, end}, f()); +} + +namespace msgpack { +bool message_is_string(byte_range bytes, const char *needle) { + bool matched = false; + size_t needleN = strlen(needle); + + foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) { + if (N == needleN) { + if (memcmp(needle, str, N) == 0) { + matched = true; + } + } + }); + return matched; +} + +void dump(byte_range bytes) { + struct inner : 
functors_defaults { + inner(unsigned indent) : indent(indent) {} + const unsigned by = 2; + unsigned indent = 0; + + void handle_string(size_t N, const unsigned char *bytes) { + char *tmp = (char *)malloc(N + 1); + memcpy(tmp, bytes, N); + tmp[N] = '\0'; + printf("\"%s\"", tmp); + free(tmp); + } + + void handle_signed(int64_t x) { printf("%ld", x); } + void handle_unsigned(uint64_t x) { printf("%lu", x); } + + const unsigned char *handle_array(uint64_t N, byte_range bytes) { + printf("\n%*s[\n", indent, ""); + indent += by; + + for (uint64_t i = 0; i < N; i++) { + indent += by; + printf("%*s", indent, ""); + const unsigned char *next = handle_msgpack(bytes, {indent}); + printf(",\n"); + indent -= by; + bytes.start = next; + if (!next) { + break; + } + } + indent -= by; + printf("%*s]", indent, ""); + + return bytes.start; + } + + const unsigned char *handle_map(uint64_t N, byte_range bytes) { + printf("\n%*s{\n", indent, ""); + indent += by; + + for (uint64_t i = 0; i < 2 * N; i += 2) { + const unsigned char *start_key = bytes.start; + printf("%*s", indent, ""); + const unsigned char *end_key = + handle_msgpack({start_key, bytes.end}, {indent}); + if (!end_key) { + break; + } + + printf(" : "); + + const unsigned char *start_value = end_key; + const unsigned char *end_value = + handle_msgpack({start_value, bytes.end}, {indent}); + + if (!end_value) { + break; + } + + printf(",\n"); + bytes.start = end_value; + } + + indent -= by; + printf("%*s}", indent, ""); + + return bytes.start; + } + }; + + handle_msgpack(bytes, {0}); + printf("\n"); +} + +} // namespace msgpack diff --git a/libomptarget/plugins/amdgpu/impl/msgpack.def b/libomptarget/plugins/amdgpu/impl/msgpack.def new file mode 100644 index 000000000..a686c5a2f --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/msgpack.def @@ -0,0 +1,38 @@ +// name, header width, reader, [lower, upper] encoding +X(posfixint, 1, read_embedded_u8, 0x00, 0x7f) +X(negfixint, 1, read_embedded_s8, 0xe0, 0xff) +X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f) +X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f) +X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf) +X(nil, 1, read_zero, 0xc0, 0xc0) +X(never_used, 1, read_zero, 0xc1, 0xc1) +X(f, 1, read_via_mask_0x1, 0xc2, 0xc2) +X(t, 1, read_via_mask_0x1, 0xc3, 0xc3) +X(bin8, 2, read_size_field_u8, 0xc4, 0xc4) +X(bin16, 3, read_size_field_u16, 0xc5, 0xc5) +X(bin32, 5, read_size_field_u32, 0xc6, 0xc6) +X(ext8, 3, read_size_field_u8, 0xc7, 0xc7) +X(ext16, 4, read_size_field_u16, 0xc8, 0xc8) +X(ext32, 6, read_size_field_u32, 0xc9, 0xc9) +X(float32, 5, read_zero, 0xca, 0xca) +X(float64, 9, read_zero, 0xcb, 0xcb) +X(uint8, 2, read_size_field_u8, 0xcc, 0xcc) +X(uint16, 3, read_size_field_u16, 0xcd, 0xcd) +X(uint32, 5, read_size_field_u32, 0xce, 0xce) +X(uint64, 9, read_size_field_u64, 0xcf, 0xcf) +X(int8, 2, read_size_field_s8, 0xd0, 0xd0) +X(int16, 3, read_size_field_s16, 0xd1, 0xd1) +X(int32, 5, read_size_field_s32, 0xd2, 0xd2) +X(int64, 9, read_size_field_s64, 0xd3, 0xd3) +X(fixext1, 3, read_zero, 0xd4, 0xd4) +X(fixext2, 4, read_zero, 0xd5, 0xd5) +X(fixext4, 6, read_zero, 0xd6, 0xd6) +X(fixext8, 10, read_zero, 0xd7, 0xd7) +X(fixext16, 18, read_zero, 0xd8, 0xd8) +X(str8, 2, read_size_field_u8, 0xd9, 0xd9) +X(str16, 3, read_size_field_u16, 0xda, 0xda) +X(str32, 5, read_size_field_u32, 0xdb, 0xdb) +X(array16, 3, read_size_field_u16, 0xdc, 0xdc) +X(array32, 5, read_size_field_u32, 0xdd, 0xdd) +X(map16, 3, read_size_field_u16, 0xde, 0xde) +X(map32, 5, read_size_field_u32, 0xdf, 0xdf) diff --git 
a/libomptarget/plugins/amdgpu/impl/msgpack.h b/libomptarget/plugins/amdgpu/impl/msgpack.h new file mode 100644 index 000000000..45f11d3ba --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/msgpack.h @@ -0,0 +1,275 @@ +#ifndef MSGPACK_H +#define MSGPACK_H + +#include + +namespace msgpack { + +// The message pack format is dynamically typed, schema-less. Format is: +// message: [type][header][payload] +// where type is one byte, header length is a fixed length function of type +// payload is zero to N bytes, with the length encoded in [type][header] + +// Scalar fields include boolean, signed integer, float, string etc +// Composite types are sequences of messages +// Array field is [header][element][element]... +// Map field is [header][key][value][key][value]... + +// Multibyte integer fields are big endian encoded +// The map key can be any message type +// Maps may contain duplicate keys +// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or +// in as many as nine, as signed or unsigned. Implementation defined. +// Similarly "foo" may embed the length in the type field or in multiple bytes + +// This parser is structured as an iterator over a sequence of bytes. +// It calls a user provided function on each message in order to extract fields +// The default implementation for each scalar type is to do nothing. For map or +// arrays, the default implementation returns just after that message to support +// iterating to the next message, but otherwise has no effect. + +struct byte_range { + const unsigned char *start; + const unsigned char *end; +}; + +const unsigned char *skip_next_message(const unsigned char *start, + const unsigned char *end); + +template class functors_defaults { +public: + void cb_string(size_t N, const unsigned char *str) { + derived().handle_string(N, str); + } + void cb_boolean(bool x) { derived().handle_boolean(x); } + void cb_signed(int64_t x) { derived().handle_signed(x); } + void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); } + void cb_array_elements(byte_range bytes) { + derived().handle_array_elements(bytes); + } + void cb_map_elements(byte_range key, byte_range value) { + derived().handle_map_elements(key, value); + } + const unsigned char *cb_array(uint64_t N, byte_range bytes) { + return derived().handle_array(N, bytes); + } + const unsigned char *cb_map(uint64_t N, byte_range bytes) { + return derived().handle_map(N, bytes); + } + +private: + Derived &derived() { return *static_cast(this); } + + // Default implementations for scalar ops are no-ops + void handle_string(size_t, const unsigned char *) {} + void handle_boolean(bool) {} + void handle_signed(int64_t) {} + void handle_unsigned(uint64_t) {} + void handle_array_elements(byte_range) {} + void handle_map_elements(byte_range, byte_range) {} + + // Default implementation for sequences is to skip over the messages + const unsigned char *handle_array(uint64_t N, byte_range bytes) { + for (uint64_t i = 0; i < N; i++) { + const unsigned char *next = skip_next_message(bytes.start, bytes.end); + if (!next) { + return nullptr; + } + cb_array_elements(bytes); + bytes.start = next; + } + return bytes.start; + } + const unsigned char *handle_map(uint64_t N, byte_range bytes) { + for (uint64_t i = 0; i < N; i++) { + const unsigned char *start_key = bytes.start; + const unsigned char *end_key = skip_next_message(start_key, bytes.end); + if (!end_key) { + return nullptr; + } + const unsigned char *start_value = end_key; + const unsigned char *end_value = + 
skip_next_message(start_value, bytes.end); + if (!end_value) { + return nullptr; + } + cb_map_elements({start_key, end_key}, {start_value, end_value}); + bytes.start = end_value; + } + return bytes.start; + } +}; + +typedef enum : uint8_t { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME, +#include "msgpack.def" +#undef X +} type; + +[[noreturn]] void internal_error(); +type parse_type(unsigned char x); +unsigned bytes_used_fixed(type ty); + +typedef uint64_t (*payload_info_t)(const unsigned char *); +payload_info_t payload_info(msgpack::type ty); + +template R bitcast(T x); + +template +const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) { + const unsigned char *start = bytes.start; + const unsigned char *end = bytes.end; + const uint64_t available = end - start; + assert(available != 0); + assert(ty == parse_type(*start)); + + const uint64_t bytes_used = bytes_used_fixed(ty); + if (available < bytes_used) { + return 0; + } + const uint64_t available_post_header = available - bytes_used; + + const payload_info_t info = payload_info(ty); + const uint64_t N = info(start); + + switch (ty) { + case msgpack::t: + case msgpack::f: { + // t is 0b11000010, f is 0b11000011, masked with 0x1 + f.cb_boolean(N); + return start + bytes_used; + } + + case msgpack::posfixint: + case msgpack::uint8: + case msgpack::uint16: + case msgpack::uint32: + case msgpack::uint64: { + f.cb_unsigned(N); + return start + bytes_used; + } + + case msgpack::negfixint: + case msgpack::int8: + case msgpack::int16: + case msgpack::int32: + case msgpack::int64: { + f.cb_signed(bitcast(N)); + return start + bytes_used; + } + + case msgpack::fixstr: + case msgpack::str8: + case msgpack::str16: + case msgpack::str32: { + if (available_post_header < N) { + return 0; + } else { + f.cb_string(N, start + bytes_used); + return start + bytes_used + N; + } + } + + case msgpack::fixarray: + case msgpack::array16: + case msgpack::array32: { + return f.cb_array(N, {start + bytes_used, end}); + } + + case msgpack::fixmap: + case msgpack::map16: + case msgpack::map32: { + return f.cb_map(N, {start + bytes_used, end}); + } + + case msgpack::nil: + case msgpack::bin8: + case msgpack::bin16: + case msgpack::bin32: + case msgpack::float32: + case msgpack::float64: + case msgpack::ext8: + case msgpack::ext16: + case msgpack::ext32: + case msgpack::fixext1: + case msgpack::fixext2: + case msgpack::fixext4: + case msgpack::fixext8: + case msgpack::fixext16: + case msgpack::never_used: { + if (available_post_header < N) { + return 0; + } + return start + bytes_used + N; + } + } + internal_error(); +} + +template +const unsigned char *handle_msgpack(byte_range bytes, F f) { + const unsigned char *start = bytes.start; + const unsigned char *end = bytes.end; + const uint64_t available = end - start; + if (available == 0) { + return 0; + } + const type ty = parse_type(*start); + + switch (ty) { +#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \ + case msgpack::NAME: \ + return handle_msgpack_given_type(bytes, f); +#include "msgpack.def" +#undef X + } + + internal_error(); +} + +bool message_is_string(byte_range bytes, const char *str); + +template void foronly_string(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_string(size_t N, const unsigned char *str) { cb(N, str); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foronly_unsigned(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void 
handle_unsigned(uint64_t x) { cb(x); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foreach_array(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_array_elements(byte_range element) { cb(element); } + }; + handle_msgpack(bytes, {callback}); +} + +template void foreach_map(byte_range bytes, C callback) { + struct inner : functors_defaults { + inner(C &cb) : cb(cb) {} + C &cb; + void handle_map_elements(byte_range key, byte_range value) { + cb(key, value); + } + }; + handle_msgpack(bytes, {callback}); +} + +// Crude approximation to json +void dump(byte_range); + +} // namespace msgpack + +#endif diff --git a/libomptarget/plugins/amdgpu/impl/rt.h b/libomptarget/plugins/amdgpu/impl/rt.h new file mode 100644 index 000000000..757919eb3 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/rt.h @@ -0,0 +1,91 @@ +/*===-------------------------------------------------------------------------- + * ATMI (Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#ifndef SRC_RUNTIME_INCLUDE_RT_H_ +#define SRC_RUNTIME_INCLUDE_RT_H_ + +#include "atmi_runtime.h" +#include "hsa.h" +#include +#include + +namespace core { + +#define DEFAULT_MAX_QUEUE_SIZE 4096 +#define DEFAULT_DEBUG_MODE 0 +class Environment { +public: + Environment() + : max_queue_size_(DEFAULT_MAX_QUEUE_SIZE), + debug_mode_(DEFAULT_DEBUG_MODE) { + GetEnvAll(); + } + + void GetEnvAll(); + + int getMaxQueueSize() const { return max_queue_size_; } + + // TODO(ashwinma): int may change to enum if we have more debug modes + int getDebugMode() const { return debug_mode_; } + // TODO(ashwinma): int may change to enum if we have more profile modes + +private: + std::string GetEnv(const char *name) { + char *env = getenv(name); + std::string ret; + if (env) { + ret = env; + } + return ret; + } + + int max_queue_size_; + int debug_mode_; +}; + +class Runtime final { +public: + static Runtime &getInstance() { + static Runtime instance; + return instance; + } + + // init/finalize + static atmi_status_t Initialize(); + static atmi_status_t Finalize(); + // machine info + static atmi_machine_t *GetMachineInfo(); + // modules + static atmi_status_t RegisterModuleFromMemory( + void *, size_t, atmi_place_t, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state); + + // data + static atmi_status_t Memcpy(hsa_signal_t, void *, const void *, size_t); + static atmi_status_t Memfree(void *); + static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t); + + // environment variables + int getMaxQueueSize() const { return env_.getMaxQueueSize(); } + + // TODO(ashwinma): int may change to enum if we have more debug modes + int getDebugMode() const { return env_.getDebugMode(); } + +protected: + Runtime() = default; + ~Runtime() = default; + Runtime(const Runtime &) = delete; + Runtime &operator=(const Runtime &) = delete; + +protected: + // variable to track environment variables + Environment env_; +}; + +} // namespace core + +#endif // SRC_RUNTIME_INCLUDE_RT_H_ diff --git a/libomptarget/plugins/amdgpu/impl/system.cpp b/libomptarget/plugins/amdgpu/impl/system.cpp new file mode 100644 index 000000000..913dc91b2 --- /dev/null +++ b/libomptarget/plugins/amdgpu/impl/system.cpp @@ -0,0 +1,1091 @@ +/*===-------------------------------------------------------------------------- + * ATMI 
(Asynchronous Task and Memory Interface) + * + * This file is distributed under the MIT License. See LICENSE.txt for details. + *===------------------------------------------------------------------------*/ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "machine.h" +#include "rt.h" + +#include "msgpack.h" + +#define msgpackErrorCheck(msg, status) \ + if (status != 0) { \ + printf("[%s:%d] %s failed\n", __FILE__, __LINE__, #msg); \ + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; \ + } else { \ + } + +typedef unsigned char *address; +/* + * Note descriptors. + */ +typedef struct { + uint32_t n_namesz; /* Length of note's name. */ + uint32_t n_descsz; /* Length of note's value. */ + uint32_t n_type; /* Type of note. */ + // then name + // then padding, optional + // then desc, at 4 byte alignment (not 8, despite being elf64) +} Elf_Note; + +// The following include file and following structs/enums +// have been replicated on a per-use basis below. For example, +// llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields, +// but we may care only about kernargSegmentSize_ for now, so +// we just include that field in our KernelMD implementation. We +// chose this approach to replicate in order to avoid forcing +// a dependency on LLVM_INCLUDE_DIR just to compile the runtime. +// #include "llvm/Support/AMDGPUMetadata.h" +// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD; +// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; +// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; +// using llvm::AMDGPU::HSAMD::AccessQualifier; +// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier; +// using llvm::AMDGPU::HSAMD::ValueKind; +// using llvm::AMDGPU::HSAMD::ValueType; + +class KernelArgMD { +public: + enum class ValueKind { + HiddenGlobalOffsetX, + HiddenGlobalOffsetY, + HiddenGlobalOffsetZ, + HiddenNone, + HiddenPrintfBuffer, + HiddenDefaultQueue, + HiddenCompletionAction, + HiddenMultiGridSyncArg, + HiddenHostcallBuffer, + Unknown + }; + + KernelArgMD() + : name_(std::string()), typeName_(std::string()), size_(0), offset_(0), + align_(0), valueKind_(ValueKind::Unknown) {} + + // fields + std::string name_; + std::string typeName_; + uint32_t size_; + uint32_t offset_; + uint32_t align_; + ValueKind valueKind_; +}; + +class KernelMD { +public: + KernelMD() : kernargSegmentSize_(0ull) {} + + // fields + uint64_t kernargSegmentSize_; +}; + +static const std::map ArgValueKind = { + // Including only those fields that are relevant to the runtime. 
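+    // Both the legacy capitalized spellings and the v3 lowercase spellings map
+    // to the same enum values below, so either form of the .value_kind string
+    // is classified identically.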
+ // {"ByValue", KernelArgMD::ValueKind::ByValue}, + // {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer}, + // {"DynamicSharedPointer", + // KernelArgMD::ValueKind::DynamicSharedPointer}, + // {"Sampler", KernelArgMD::ValueKind::Sampler}, + // {"Image", KernelArgMD::ValueKind::Image}, + // {"Pipe", KernelArgMD::ValueKind::Pipe}, + // {"Queue", KernelArgMD::ValueKind::Queue}, + {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, + {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, + {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, + {"HiddenNone", KernelArgMD::ValueKind::HiddenNone}, + {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, + {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue}, + {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction}, + {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, + {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, + // v3 + // {"by_value", KernelArgMD::ValueKind::ByValue}, + // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer}, + // {"dynamic_shared_pointer", + // KernelArgMD::ValueKind::DynamicSharedPointer}, + // {"sampler", KernelArgMD::ValueKind::Sampler}, + // {"image", KernelArgMD::ValueKind::Image}, + // {"pipe", KernelArgMD::ValueKind::Pipe}, + // {"queue", KernelArgMD::ValueKind::Queue}, + {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, + {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, + {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, + {"hidden_none", KernelArgMD::ValueKind::HiddenNone}, + {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, + {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue}, + {"hidden_completion_action", + KernelArgMD::ValueKind::HiddenCompletionAction}, + {"hidden_multigrid_sync_arg", + KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, + {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, +}; + +// public variables -- TODO(ashwinma) move these to a runtime object? +atmi_machine_t g_atmi_machine; +ATLMachine g_atl_machine; + +hsa_region_t atl_gpu_kernarg_region; +std::vector atl_gpu_kernarg_pools; +hsa_region_t atl_cpu_kernarg_region; + +static std::vector g_executables; + +std::map KernelNameMap; +std::vector> KernelInfoTable; +std::vector> SymbolInfoTable; + +bool g_atmi_initialized = false; +bool g_atmi_hostcall_required = false; + +struct timespec context_init_time; +int context_init_time_init = 0; + +/* + atlc is all internal global values. + The structure atl_context_t is defined in atl_internal.h + Most references will use the global structure prefix atlc. + However the pointer value atlc_p-> is equivalent to atlc. + +*/ + +atl_context_t atlc = {.struct_initialized = false}; +atl_context_t *atlc_p = NULL; + +namespace core { +/* Machine Info */ +atmi_machine_t *Runtime::GetMachineInfo() { + if (!atlc.g_hsa_initialized) + return NULL; + return &g_atmi_machine; +} + +void atl_set_atmi_initialized() { + // FIXME: thread safe? locks? + g_atmi_initialized = true; +} + +void atl_reset_atmi_initialized() { + // FIXME: thread safe? locks? 
+ g_atmi_initialized = false; +} + +bool atl_is_atmi_initialized() { return g_atmi_initialized; } + +void allow_access_to_all_gpu_agents(void *ptr) { + hsa_status_t err; + std::vector &gpu_procs = + g_atl_machine.processors(); + std::vector agents; + for (uint32_t i = 0; i < gpu_procs.size(); i++) { + agents.push_back(gpu_procs[i].agent()); + } + err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); + ErrorCheck(Allow agents ptr access, err); +} + +atmi_status_t Runtime::Initialize() { + atmi_devtype_t devtype = ATMI_DEVTYPE_GPU; + if (atl_is_atmi_initialized()) + return ATMI_STATUS_SUCCESS; + + if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) { + ATMIErrorCheck(GPU context init, atl_init_gpu_context()); + } + + atl_set_atmi_initialized(); + return ATMI_STATUS_SUCCESS; +} + +atmi_status_t Runtime::Finalize() { + // TODO(ashwinma): Finalize all processors, queues, signals, kernarg memory + // regions + hsa_status_t err; + + for (uint32_t i = 0; i < g_executables.size(); i++) { + err = hsa_executable_destroy(g_executables[i]); + ErrorCheck(Destroying executable, err); + } + + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) { + SymbolInfoTable[i].clear(); + } + SymbolInfoTable.clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) { + KernelInfoTable[i].clear(); + } + KernelInfoTable.clear(); + + atl_reset_atmi_initialized(); + err = hsa_shut_down(); + ErrorCheck(Shutting down HSA, err); + + return ATMI_STATUS_SUCCESS; +} + +void atmi_init_context_structs() { + atlc_p = &atlc; + atlc.struct_initialized = true; /* This only gets called one time */ + atlc.g_hsa_initialized = false; + atlc.g_gpu_initialized = false; + atlc.g_tasks_initialized = false; +} + +// Implement memory_pool iteration function +static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool, + void *data) { + ATLProcessor *proc = reinterpret_cast(data); + hsa_status_t err = HSA_STATUS_SUCCESS; + // Check if the memory_pool is allowed to allocate, i.e. 
do not return group + // memory + bool alloc_allowed = false; + err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &alloc_allowed); + ErrorCheck(Alloc allowed in memory pool check, err); + if (alloc_allowed) { + uint32_t global_flag = 0; + err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + ErrorCheck(Get memory pool info, err); + if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { + ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); + proc->addMemory(new_mem); + if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { + DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle); + atl_gpu_kernarg_pools.push_back(memory_pool); + } + } else { + ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED); + proc->addMemory(new_mem); + } + } + + return err; +} + +static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) { + hsa_status_t err = HSA_STATUS_SUCCESS; + hsa_device_type_t device_type; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + ErrorCheck(Get device type info, err); + switch (device_type) { + case HSA_DEVICE_TYPE_CPU: { + ; + ATLCPUProcessor new_proc(agent); + err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, + &new_proc); + ErrorCheck(Iterate all memory pools, err); + g_atl_machine.addProcessor(new_proc); + } break; + case HSA_DEVICE_TYPE_GPU: { + ; + hsa_profile_t profile; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile); + ErrorCheck(Query the agent profile, err); + atmi_devtype_t gpu_type; + gpu_type = + (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU; + ATLGPUProcessor new_proc(agent, gpu_type); + err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, + &new_proc); + ErrorCheck(Iterate all memory pools, err); + g_atl_machine.addProcessor(new_proc); + } break; + case HSA_DEVICE_TYPE_DSP: { + err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } break; + } + + return err; +} + +hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) { + hsa_region_segment_t segment; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (segment != HSA_REGION_SEGMENT_GLOBAL) { + return HSA_STATUS_SUCCESS; + } + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) { + hsa_region_t *ret = reinterpret_cast(data); + *ret = region; + return HSA_STATUS_INFO_BREAK; + } + return HSA_STATUS_SUCCESS; +} + +/* Determines if a memory region can be used for kernarg allocations. 
*/ +static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) { + hsa_region_segment_t segment; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (HSA_REGION_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { + hsa_region_t *ret = reinterpret_cast(data); + *ret = region; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t init_compute_and_memory() { + hsa_status_t err; + + /* Iterate over the agents and pick the gpu agent */ + err = hsa_iterate_agents(get_agent_info, NULL); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } + ErrorCheck(Getting a gpu agent, err); + if (err != HSA_STATUS_SUCCESS) + return err; + + /* Init all devices or individual device types? */ + std::vector &cpu_procs = + g_atl_machine.processors(); + std::vector &gpu_procs = + g_atl_machine.processors(); + /* For CPU memory pools, add other devices that can access them directly + * or indirectly */ + for (auto &cpu_proc : cpu_procs) { + for (auto &cpu_mem : cpu_proc.memories()) { + hsa_amd_memory_pool_t pool = cpu_mem.memory(); + for (auto &gpu_proc : gpu_procs) { + hsa_agent_t agent = gpu_proc.agent(); + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info( + agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + if (access != 0) { + // this means not NEVER, but could be YES or NO + // add this memory pool to the proc + gpu_proc.addMemory(cpu_mem); + } + } + } + } + + /* FIXME: are the below combinations of procs and memory pools needed? + * all to all compare procs with their memory pools and add those memory + * pools that are accessible by the target procs */ + for (auto &gpu_proc : gpu_procs) { + for (auto &gpu_mem : gpu_proc.memories()) { + hsa_amd_memory_pool_t pool = gpu_mem.memory(); + for (auto &cpu_proc : cpu_procs) { + hsa_agent_t agent = cpu_proc.agent(); + hsa_amd_memory_pool_access_t access; + hsa_amd_agent_memory_pool_get_info( + agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + if (access != 0) { + // this means not NEVER, but could be YES or NO + // add this memory pool to the proc + cpu_proc.addMemory(gpu_mem); + } + } + } + } + + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size(); + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size(); + + size_t num_procs = cpu_procs.size() + gpu_procs.size(); + // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs * + // sizeof(atmi_device_t)); + atmi_device_t *all_devices = reinterpret_cast( + malloc(num_procs * sizeof(atmi_device_t))); + int num_iGPUs = 0; + int num_dGPUs = 0; + for (uint32_t i = 0; i < gpu_procs.size(); i++) { + if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU) + num_iGPUs++; + else + num_dGPUs++; + } + assert(num_iGPUs + num_dGPUs == gpu_procs.size() && + "Number of dGPUs and iGPUs do not add up"); + DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size()); + DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs); + DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs); + DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size()); + + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs; + g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs; + + int cpus_begin = 0; + int cpus_end = cpu_procs.size(); + int gpus_begin = cpu_procs.size(); + int gpus_end = cpu_procs.size() + gpu_procs.size(); + 
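+  // all_devices layout: CPU entries occupy [cpus_begin, cpus_end) and GPU
+  // entries (iGPU and dGPU alike) occupy [gpus_begin, gpus_end); the
+  // per-type pointers assigned below all alias into this single allocation.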
g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin]; + g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin]; + int proc_index = 0; + for (int i = cpus_begin; i < cpus_end; i++) { + all_devices[i].type = cpu_procs[proc_index].type(); + + std::vector memories = cpu_procs[proc_index].memories(); + int fine_memories_size = 0; + int coarse_memories_size = 0; + DEBUG_PRINT("CPU memory types:\t"); + for (auto &memory : memories) { + atmi_memtype_t type = memory.type(); + if (type == ATMI_MEMTYPE_FINE_GRAINED) { + fine_memories_size++; + DEBUG_PRINT("Fine\t"); + } else { + coarse_memories_size++; + DEBUG_PRINT("Coarse\t"); + } + } + DEBUG_PRINT("\nFine Memories : %d", fine_memories_size); + DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size); + proc_index++; + } + proc_index = 0; + for (int i = gpus_begin; i < gpus_end; i++) { + all_devices[i].type = gpu_procs[proc_index].type(); + + std::vector memories = gpu_procs[proc_index].memories(); + int fine_memories_size = 0; + int coarse_memories_size = 0; + DEBUG_PRINT("GPU memory types:\t"); + for (auto &memory : memories) { + atmi_memtype_t type = memory.type(); + if (type == ATMI_MEMTYPE_FINE_GRAINED) { + fine_memories_size++; + DEBUG_PRINT("Fine\t"); + } else { + coarse_memories_size++; + DEBUG_PRINT("Coarse\t"); + } + } + DEBUG_PRINT("\nFine Memories : %d", fine_memories_size); + DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size); + proc_index++; + } + proc_index = 0; + atl_cpu_kernarg_region.handle = (uint64_t)-1; + if (cpu_procs.size() > 0) { + err = hsa_agent_iterate_regions( + cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } + err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR + : HSA_STATUS_SUCCESS; + ErrorCheck(Finding a CPU kernarg memory region handle, err); + } + /* Find a memory region that supports kernel arguments. */ + atl_gpu_kernarg_region.handle = (uint64_t)-1; + if (gpu_procs.size() > 0) { + hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region, + &atl_gpu_kernarg_region); + err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? 
HSA_STATUS_ERROR + : HSA_STATUS_SUCCESS; + ErrorCheck(Finding a kernarg memory region, err); + } + if (num_procs > 0) + return HSA_STATUS_SUCCESS; + else + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} + +hsa_status_t init_hsa() { + if (atlc.g_hsa_initialized == false) { + DEBUG_PRINT("Initializing HSA..."); + hsa_status_t err = hsa_init(); + ErrorCheck(Initializing the hsa runtime, err); + if (err != HSA_STATUS_SUCCESS) + return err; + + err = init_compute_and_memory(); + if (err != HSA_STATUS_SUCCESS) + return err; + ErrorCheck(After initializing compute and memory, err); + + int gpu_count = g_atl_machine.processorCount(); + KernelInfoTable.resize(gpu_count); + SymbolInfoTable.resize(gpu_count); + for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) + SymbolInfoTable[i].clear(); + for (uint32_t i = 0; i < KernelInfoTable.size(); i++) + KernelInfoTable[i].clear(); + atlc.g_hsa_initialized = true; + DEBUG_PRINT("done\n"); + } + return HSA_STATUS_SUCCESS; +} + +void init_tasks() { + if (atlc.g_tasks_initialized != false) + return; + std::vector gpu_agents; + int gpu_count = g_atl_machine.processorCount(); + for (int gpu = 0; gpu < gpu_count; gpu++) { + atmi_place_t place = ATMI_PLACE_GPU(0, gpu); + ATLGPUProcessor &proc = get_processor(place); + gpu_agents.push_back(proc.agent()); + } + atlc.g_tasks_initialized = true; +} + +hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { +#if (ROCM_VERSION_MAJOR >= 3) || \ + (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3) + if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { +#else + if (event->event_type == GPU_MEMORY_FAULT_EVENT) { +#endif + hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; + // memory_fault.agent + // memory_fault.virtual_address + // memory_fault.fault_reason_mask + // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address); + std::stringstream stream; + stream << std::hex << (uintptr_t)memory_fault.virtual_address; + std::string addr("0x" + stream.str()); + + std::string err_string = "[GPU Memory Error] Addr: " + addr; + err_string += " Reason: "; + if (!(memory_fault.fault_reason_mask & 0x00111111)) { + err_string += "No Idea! "; + } else { + if (memory_fault.fault_reason_mask & 0x00000001) + err_string += "Page not present or supervisor privilege. "; + if (memory_fault.fault_reason_mask & 0x00000010) + err_string += "Write access to a read-only page. "; + if (memory_fault.fault_reason_mask & 0x00000100) + err_string += "Execute access to a page marked NX. "; + if (memory_fault.fault_reason_mask & 0x00001000) + err_string += "Host access only. "; + if (memory_fault.fault_reason_mask & 0x00010000) + err_string += "ECC failure (if supported by HW). "; + if (memory_fault.fault_reason_mask & 0x00100000) + err_string += "Can't determine the exact fault address. 
"; + } + fprintf(stderr, "%s\n", err_string.c_str()); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +atmi_status_t atl_init_gpu_context() { + if (atlc.struct_initialized == false) + atmi_init_context_structs(); + if (atlc.g_gpu_initialized != false) + return ATMI_STATUS_SUCCESS; + + hsa_status_t err; + err = init_hsa(); + if (err != HSA_STATUS_SUCCESS) + return ATMI_STATUS_ERROR; + + if (context_init_time_init == 0) { + clock_gettime(CLOCK_MONOTONIC_RAW, &context_init_time); + context_init_time_init = 1; + } + + err = hsa_amd_register_system_event_handler(callbackEvent, NULL); + ErrorCheck(Registering the system for memory faults, err); + + init_tasks(); + atlc.g_gpu_initialized = true; + return ATMI_STATUS_SUCCESS; +} + +bool isImplicit(KernelArgMD::ValueKind value_kind) { + switch (value_kind) { + case KernelArgMD::ValueKind::HiddenGlobalOffsetX: + case KernelArgMD::ValueKind::HiddenGlobalOffsetY: + case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: + case KernelArgMD::ValueKind::HiddenNone: + case KernelArgMD::ValueKind::HiddenPrintfBuffer: + case KernelArgMD::ValueKind::HiddenDefaultQueue: + case KernelArgMD::ValueKind::HiddenCompletionAction: + case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: + case KernelArgMD::ValueKind::HiddenHostcallBuffer: + return true; + default: + return false; + } +} + +static std::pair +find_metadata(void *binary, size_t binSize) { + std::pair failure = {nullptr, nullptr}; + + Elf *e = elf_memory(static_cast(binary), binSize); + if (elf_kind(e) != ELF_K_ELF) { + return failure; + } + + size_t numpHdrs; + if (elf_getphdrnum(e, &numpHdrs) != 0) { + return failure; + } + + for (size_t i = 0; i < numpHdrs; ++i) { + GElf_Phdr pHdr; + if (gelf_getphdr(e, i, &pHdr) != &pHdr) { + continue; + } + // Look for the runtime metadata note + if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { + // Iterate over the notes in this segment + address ptr = (address)binary + pHdr.p_offset; + address segmentEnd = ptr + pHdr.p_filesz; + + while (ptr < segmentEnd) { + Elf_Note *note = reinterpret_cast(ptr); + address name = (address)¬e[1]; + + if (note->n_type == 7 || note->n_type == 8) { + return failure; + } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ && + note->n_namesz == sizeof "AMD" && + !memcmp(name, "AMD", note->n_namesz)) { + // code object v2 uses yaml metadata, no longer supported + return failure; + } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ && + note->n_namesz == sizeof "AMDGPU" && + !memcmp(name, "AMDGPU", note->n_namesz)) { + + // n_descsz = 485 + // value is padded to 4 byte alignment, may want to move end up to + // match + size_t offset = sizeof(uint32_t) * 3 /* fields */ + + sizeof("AMDGPU") /* name */ + + 1 /* padding to 4 byte alignment */; + + // Including the trailing padding means both pointers are 4 bytes + // aligned, which may be useful later. 
+ unsigned char *metadata_start = (unsigned char *)ptr + offset; + unsigned char *metadata_end = + metadata_start + core::alignUp(note->n_descsz, 4); + return {metadata_start, metadata_end}; + } + ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) + + core::alignUp(note->n_descsz, sizeof(int)); + } + } + } + + return failure; +} + +namespace { +int map_lookup_array(msgpack::byte_range message, const char *needle, + msgpack::byte_range *res, uint64_t *size) { + unsigned count = 0; + struct s : msgpack::functors_defaults { + s(unsigned &count, uint64_t *size) : count(count), size(size) {} + unsigned &count; + uint64_t *size; + const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) { + count++; + *size = N; + return bytes.end; + } + }; + + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + // If the message is an array, record number of + // elements in *size + msgpack::handle_msgpack(value, {count, size}); + // return the whole array + *res = value; + } + }); + // Only claim success if exactly one key/array pair matched + return count != 1; +} + +int map_lookup_string(msgpack::byte_range message, const char *needle, + std::string *res) { + unsigned count = 0; + struct s : public msgpack::functors_defaults { + s(unsigned &count, std::string *res) : count(count), res(res) {} + unsigned &count; + std::string *res; + void handle_string(size_t N, const unsigned char *str) { + count++; + *res = std::string(str, str + N); + } + }; + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + msgpack::handle_msgpack(value, {count, res}); + } + }); + return count != 1; +} + +int map_lookup_uint64_t(msgpack::byte_range message, const char *needle, + uint64_t *res) { + unsigned count = 0; + msgpack::foreach_map(message, + [&](msgpack::byte_range key, msgpack::byte_range value) { + if (msgpack::message_is_string(key, needle)) { + msgpack::foronly_unsigned(value, [&](uint64_t x) { + count++; + *res = x; + }); + } + }); + return count != 1; +} + +int array_lookup_element(msgpack::byte_range message, uint64_t elt, + msgpack::byte_range *res) { + int rc = 1; + uint64_t i = 0; + msgpack::foreach_array(message, [&](msgpack::byte_range value) { + if (i == elt) { + *res = value; + rc = 0; + } + i++; + }); + return rc; +} + +int populate_kernelArgMD(msgpack::byte_range args_element, + KernelArgMD *kernelarg) { + using namespace msgpack; + int error = 0; + foreach_map(args_element, [&](byte_range key, byte_range value) -> void { + if (message_is_string(key, ".name")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + kernelarg->name_ = std::string(str, str + N); + }); + } else if (message_is_string(key, ".type_name")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + kernelarg->typeName_ = std::string(str, str + N); + }); + } else if (message_is_string(key, ".size")) { + foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; }); + } else if (message_is_string(key, ".offset")) { + foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; }); + } else if (message_is_string(key, ".value_kind")) { + foronly_string(value, [&](size_t N, const unsigned char *str) { + std::string s = std::string(str, str + N); + auto itValueKind = ArgValueKind.find(s); + if (itValueKind != ArgValueKind.end()) { + kernelarg->valueKind_ = itValueKind->second; + } + }); + } + }); + 
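+  // 'error' is never set above; a missing or unrecognized field simply leaves
+  // the corresponding KernelArgMD member at its default value.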
return error; +} +} // namespace + +static hsa_status_t get_code_object_custom_metadata(void *binary, + size_t binSize, int gpu) { + // parse code object with different keys from v2 + // also, the kernel name is not the same as the symbol name -- so a + // symbol->name map is needed + + std::pair metadata = + find_metadata(binary, binSize); + if (!metadata.first) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + uint64_t kernelsSize = 0; + int msgpack_errors = 0; + msgpack::byte_range kernel_array; + msgpack_errors = + map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels", + &kernel_array, &kernelsSize); + msgpackErrorCheck(kernels lookup in program metadata, msgpack_errors); + + for (size_t i = 0; i < kernelsSize; i++) { + assert(msgpack_errors == 0); + std::string kernelName; + std::string languageName; + std::string symbolName; + + msgpack::byte_range element; + msgpack_errors += array_lookup_element(kernel_array, i, &element); + msgpackErrorCheck(element lookup in kernel metadata, msgpack_errors); + + msgpack_errors += map_lookup_string(element, ".name", &kernelName); + msgpack_errors += map_lookup_string(element, ".language", &languageName); + msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); + msgpackErrorCheck(strings lookup in kernel metadata, msgpack_errors); + + atl_kernel_info_t info = {0, 0, 0, 0, 0, {}, {}, {}}; + size_t kernel_explicit_args_size = 0; + uint64_t kernel_segment_size; + msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size", + &kernel_segment_size); + msgpackErrorCheck(kernarg segment size metadata lookup in kernel metadata, + msgpack_errors); + + // create a map from symbol to name + DEBUG_PRINT("Kernel symbol %s; Name: %s; Size: %lu\n", symbolName.c_str(), + kernelName.c_str(), kernel_segment_size); + KernelNameMap[symbolName] = kernelName; + + bool hasHiddenArgs = false; + if (kernel_segment_size > 0) { + uint64_t argsSize; + size_t offset = 0; + + msgpack::byte_range args_array; + msgpack_errors += + map_lookup_array(element, ".args", &args_array, &argsSize); + msgpackErrorCheck(kernel args metadata lookup in kernel metadata, + msgpack_errors); + + info.num_args = argsSize; + + for (size_t i = 0; i < argsSize; ++i) { + KernelArgMD lcArg; + + msgpack::byte_range args_element; + msgpack_errors += array_lookup_element(args_array, i, &args_element); + msgpackErrorCheck(iterate args map in kernel args metadata, + msgpack_errors); + + msgpack_errors += populate_kernelArgMD(args_element, &lcArg); + msgpackErrorCheck(iterate args map in kernel args metadata, + msgpack_errors); + + // TODO(ashwinma): should the below population actions be done only for + // non-implicit args? 
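+        // Worked example with assumed values: explicit args of 4 and 8 bytes at
+        // offsets 0 and 8 give a padding of 4 before the second arg, so
+        // kernel_explicit_args_size accumulates 4 + 4 + 8 = 16 bytes.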
+ // populate info with sizes and offsets + info.arg_sizes.push_back(lcArg.size_); + // v3 has offset field and not align field + size_t new_offset = lcArg.offset_; + size_t padding = new_offset - offset; + offset = new_offset; + info.arg_offsets.push_back(lcArg.offset_); + DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), + lcArg.size_, lcArg.offset_); + offset += lcArg.size_; + + // check if the arg is a hidden/implicit arg + // this logic assumes that all hidden args are 8-byte aligned + if (!isImplicit(lcArg.valueKind_)) { + kernel_explicit_args_size += lcArg.size_; + } else { + hasHiddenArgs = true; + } + kernel_explicit_args_size += padding; + } + } + + // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but + // in ATMI, do not count the compiler set implicit args, but set your own + // implicit args by discounting the compiler set implicit args + info.kernel_segment_size = + (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) + + sizeof(atmi_implicit_args_t); + DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), + kernel_segment_size, info.kernel_segment_size); + + // kernel received, now add it to the kernel info table + KernelInfoTable[gpu][kernelName] = info; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t populate_InfoTables(hsa_executable_t executable, + hsa_executable_symbol_t symbol, + void *data) { + int gpu = *static_cast(data); + hsa_symbol_kind_t type; + + uint32_t name_length; + hsa_status_t err; + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, + &type); + ErrorCheck(Symbol info extraction, err); + DEBUG_PRINT("Exec Symbol type: %d\n", type); + if (type == HSA_SYMBOL_KIND_KERNEL) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + ErrorCheck(Symbol info extraction, err); + char *name = reinterpret_cast(malloc(name_length + 1)); + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + ErrorCheck(Symbol info extraction, err); + name[name_length] = 0; + + if (KernelNameMap.find(std::string(name)) == KernelNameMap.end()) { + // did not find kernel name in the kernel map; this can happen only + // if the ROCr API for getting symbol info (name) is different from + // the comgr method of getting symbol info + ErrorCheck(Invalid kernel name, HSA_STATUS_ERROR_INVALID_CODE_OBJECT); + } + atl_kernel_info_t info; + std::string kernelName = KernelNameMap[std::string(name)]; + // by now, the kernel info table should already have an entry + // because the non-ROCr custom code object parsing is called before + // iterating over the code object symbols using ROCr + if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) { + ErrorCheck(Finding the entry kernel info table, + HSA_STATUS_ERROR_INVALID_CODE_OBJECT); + } + // found, so assign and update + info = KernelInfoTable[gpu][kernelName]; + + /* Extract dispatch information from the symbol */ + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &(info.kernel_object)); + ErrorCheck(Extracting the symbol from the executable, err); + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &(info.group_segment_size)); + ErrorCheck(Extracting the group segment size from the executable, err); + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &(info.private_segment_size)); + 
ErrorCheck(Extracting the private segment from the executable, err); + + DEBUG_PRINT( + "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " + "kernarg\n", + kernelName.c_str(), info.kernel_object, info.group_segment_size, + info.private_segment_size, info.kernel_segment_size); + + // assign it back to the kernel info table + KernelInfoTable[gpu][kernelName] = info; + free(name); + } else if (type == HSA_SYMBOL_KIND_VARIABLE) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + ErrorCheck(Symbol info extraction, err); + char *name = reinterpret_cast(malloc(name_length + 1)); + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + ErrorCheck(Symbol info extraction, err); + name[name_length] = 0; + + atl_symbol_info_t info; + + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr)); + ErrorCheck(Symbol info address extraction, err); + + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size)); + ErrorCheck(Symbol info size extraction, err); + + atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0); + DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, + info.size); + register_allocation(reinterpret_cast(info.addr), (size_t)info.size, + place); + SymbolInfoTable[gpu][std::string(name)] = info; + if (strcmp(name, "needs_hostcall_buffer") == 0) + g_atmi_hostcall_required = true; + free(name); + } else { + DEBUG_PRINT("Symbol is an indirect function\n"); + } + return HSA_STATUS_SUCCESS; +} + +atmi_status_t Runtime::RegisterModuleFromMemory( + void *module_bytes, size_t module_size, atmi_place_t place, + atmi_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state) { + hsa_status_t err; + int gpu = place.device_id; + assert(gpu >= 0); + + DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu); + ATLGPUProcessor &proc = get_processor(place); + hsa_agent_t agent = proc.agent(); + hsa_executable_t executable = {0}; + hsa_profile_t agent_profile; + + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); + ErrorCheck(Query the agent profile, err); + // FIXME: Assume that every profile is FULL until we understand how to build + // GCN with base profile + agent_profile = HSA_PROFILE_FULL; + /* Create the empty executable. */ + err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "", + &executable); + ErrorCheck(Create the executable, err); + + bool module_load_success = false; + do // Existing control flow used continue, preserve that for this patch + { + { + // Some metadata info is not available through ROCr API, so use custom + // code object metadata parsing to collect such metadata info + + err = get_code_object_custom_metadata(module_bytes, module_size, gpu); + ErrorCheckAndContinue(Getting custom code object metadata, err); + + // Deserialize code object. 
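
The deserialize/load sequence below runs inside the `do { ... } while (0)` block opened above, where `ErrorCheckAndContinue` abandons the remaining steps with `continue`, leaving `module_load_success` false. A self-contained sketch of that control-flow idiom, with a hypothetical `step()` and status enum standing in for the HSA/ATMI calls:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the status type and one pipeline step.
enum Status { OK, FAIL };
static Status step(const char *what, bool succeed) {
  std::printf("%s: %s\n", what, succeed ? "ok" : "failed");
  return succeed ? OK : FAIL;
}

// Mirrors the ErrorCheckAndContinue idea: on error, skip the rest of the
// do/while body by jumping to the (always false) loop condition.
#define CHECK_AND_CONTINUE(status)                                             \
  if ((status) != OK) {                                                        \
    continue;                                                                  \
  }

int main() {
  bool module_load_success = false;
  do { // single-iteration loop: `continue` exits after the condition check
    CHECK_AND_CONTINUE(step("parse metadata", true));
    CHECK_AND_CONTINUE(step("deserialize code object", false)); // simulated failure
    CHECK_AND_CONTINUE(step("load code object", true));         // skipped
    module_load_success = true;
  } while (0);
  std::printf("module load successful? %d\n", module_load_success);
  return module_load_success ? 0 : 1;
}
```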
+      hsa_code_object_t code_object = {0};
+      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
+                                        &code_object);
+      ErrorCheckAndContinue(Code Object Deserialization, err);
+      assert(0 != code_object.handle);
+
+      // Mutating the device image here avoids another allocation & memcpy
+      void *code_object_alloc_data =
+          reinterpret_cast<void *>(code_object.handle);
+      atmi_status_t atmi_err =
+          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
+      ATMIErrorCheck(Error in deserialized_data callback, atmi_err);
+
+      /* Load the code object. */
+      err =
+          hsa_executable_load_code_object(executable, agent, code_object, NULL);
+      ErrorCheckAndContinue(Loading the code object, err);
+
+      // cannot iterate over symbols until executable is frozen
+    }
+    module_load_success = true;
+  } while (0);
+  DEBUG_PRINT("Module load successful? %d\n", module_load_success);
+  if (module_load_success) {
+    /* Freeze the executable; it can now be queried for symbols. */
+    err = hsa_executable_freeze(executable, "");
+    ErrorCheck(Freeze the executable, err);
+
+    err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
+                                         static_cast<void *>(&gpu));
+    ErrorCheck(Iterating over symbols for executable, err);
+
+    // save the executable and destroy during finalize
+    g_executables.push_back(executable);
+    return ATMI_STATUS_SUCCESS;
+  } else {
+    return ATMI_STATUS_ERROR;
+  }
+}
+
+} // namespace core
diff --git a/libomptarget/plugins/amdgpu/impl/utils.cpp b/libomptarget/plugins/amdgpu/impl/utils.cpp
new file mode 100644
index 000000000..2aa09ff47
--- /dev/null
+++ b/libomptarget/plugins/amdgpu/impl/utils.cpp
@@ -0,0 +1,113 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/ +#include "internal.h" +#include "rt.h" + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include + +/* + * Helper functions + */ +const char *get_atmi_error_string(atmi_status_t err) { + switch (err) { + case ATMI_STATUS_SUCCESS: + return "ATMI_STATUS_SUCCESS"; + case ATMI_STATUS_UNKNOWN: + return "ATMI_STATUS_UNKNOWN"; + case ATMI_STATUS_ERROR: + return "ATMI_STATUS_ERROR"; + default: + return ""; + } +} + +const char *get_error_string(hsa_status_t err) { + switch (err) { + case HSA_STATUS_SUCCESS: + return "HSA_STATUS_SUCCESS"; + case HSA_STATUS_INFO_BREAK: + return "HSA_STATUS_INFO_BREAK"; + case HSA_STATUS_ERROR: + return "HSA_STATUS_ERROR"; + case HSA_STATUS_ERROR_INVALID_ARGUMENT: + return "HSA_STATUS_ERROR_INVALID_ARGUMENT"; + case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: + return "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION"; + case HSA_STATUS_ERROR_INVALID_ALLOCATION: + return "HSA_STATUS_ERROR_INVALID_ALLOCATION"; + case HSA_STATUS_ERROR_INVALID_AGENT: + return "HSA_STATUS_ERROR_INVALID_AGENT"; + case HSA_STATUS_ERROR_INVALID_REGION: + return "HSA_STATUS_ERROR_INVALID_REGION"; + case HSA_STATUS_ERROR_INVALID_SIGNAL: + return "HSA_STATUS_ERROR_INVALID_SIGNAL"; + case HSA_STATUS_ERROR_INVALID_QUEUE: + return "HSA_STATUS_ERROR_INVALID_QUEUE"; + case HSA_STATUS_ERROR_OUT_OF_RESOURCES: + return "HSA_STATUS_ERROR_OUT_OF_RESOURCES"; + case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: + return "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT"; + case HSA_STATUS_ERROR_RESOURCE_FREE: + return "HSA_STATUS_ERROR_RESOURCE_FREE"; + case HSA_STATUS_ERROR_NOT_INITIALIZED: + return "HSA_STATUS_ERROR_NOT_INITIALIZED"; + case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: + return "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW"; + case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: + return "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS"; + case HSA_STATUS_ERROR_INVALID_INDEX: + return "HSA_STATUS_ERROR_INVALID_INDEX"; + case HSA_STATUS_ERROR_INVALID_ISA: + return "HSA_STATUS_ERROR_INVALID_ISA"; + case HSA_STATUS_ERROR_INVALID_ISA_NAME: + return "HSA_STATUS_ERROR_INVALID_ISA_NAME"; + case HSA_STATUS_ERROR_INVALID_CODE_OBJECT: + return "HSA_STATUS_ERROR_INVALID_CODE_OBJECT"; + case HSA_STATUS_ERROR_INVALID_EXECUTABLE: + return "HSA_STATUS_ERROR_INVALID_EXECUTABLE"; + case HSA_STATUS_ERROR_FROZEN_EXECUTABLE: + return "HSA_STATUS_ERROR_FROZEN_EXECUTABLE"; + case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: + return "HSA_STATUS_ERROR_INVALID_SYMBOL_NAME"; + case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED: + return "HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED"; + case HSA_STATUS_ERROR_VARIABLE_UNDEFINED: + return "HSA_STATUS_ERROR_VARIABLE_UNDEFINED"; + case HSA_STATUS_ERROR_EXCEPTION: + return "HSA_STATUS_ERROR_EXCEPTION"; + } +} + +namespace core { +/* + * Environment variables + */ +void Environment::GetEnvAll() { + std::string var = GetEnv("ATMI_HELP"); + if (!var.empty()) { + std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl + << "ATMI_DEBUG : 1 for printing out trace/debug info" + << std::endl; + exit(0); + } + + var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); + if (!var.empty()) + max_queue_size_ = std::stoi(var); + + var = GetEnv("ATMI_DEBUG"); + if (!var.empty()) + debug_mode_ = std::stoi(var); +} +} // namespace core diff --git a/libomptarget/plugins/amdgpu/src/rtl.cpp b/libomptarget/plugins/amdgpu/src/rtl.cpp new file mode 100644 index 000000000..9453171e1 --- /dev/null +++ 
b/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -0,0 +1,1940 @@ +//===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for hsa machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Header from ATMI interface +#include "atmi_interop_hsa.h" +#include "atmi_runtime.h" + +#include "internal.h" + +#include "Debug.h" +#include "get_elf_mach_gfx_name.h" +#include "omptargetplugin.h" + +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +#ifndef TARGET_NAME +#define TARGET_NAME AMDHSA +#endif +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" + +// hostrpc interface, FIXME: consider moving to its own include these are +// statically linked into amdgpu/plugin if present from hostrpc_services.a, +// linked as --whole-archive to override the weak symbols that are used to +// implement a fallback for toolchains that do not yet have a hostrpc library. +extern "C" { +unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q, + uint32_t device_id); +hsa_status_t hostrpc_init(); +hsa_status_t hostrpc_terminate(); + +__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } +__attribute__((weak)) hsa_status_t hostrpc_terminate() { + return HSA_STATUS_SUCCESS; +} +__attribute__((weak)) unsigned long +hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) { + DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " + "missing\n", + device_id); + return 0; +} +} + +int print_kernel_trace; + +// Size of the target call stack struture +uint32_t TgtStackItemSize = 0; + +#undef check // Drop definition from internal.h +#ifdef OMPTARGET_DEBUG +#define check(msg, status) \ + if (status != ATMI_STATUS_SUCCESS) { \ + /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg);*/ \ + DP(#msg " failed\n"); \ + /*assert(0);*/ \ + } else { \ + /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg); \ + */ \ + DP(#msg " succeeded\n"); \ + } +#else +#define check(msg, status) \ + {} +#endif + +#include "elf_common.h" + +/// Keep entries table per device +struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +enum ExecutionModeType { + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + NONE +}; + +struct KernelArgPool { +private: + static pthread_mutex_t mutex; + +public: + uint32_t kernarg_segment_size; + void *kernarg_region = nullptr; + std::queue free_kernarg_segments; + + uint32_t kernarg_size_including_implicit() { + return kernarg_segment_size + sizeof(atmi_implicit_args_t); + } + + ~KernelArgPool() { + if (kernarg_region) { + auto r = hsa_amd_memory_pool_free(kernarg_region); + assert(r == HSA_STATUS_SUCCESS); + ErrorCheck(Memory pool free, r); + } + } + + // Can't really copy or move a mutex + KernelArgPool() = default; + KernelArgPool(const KernelArgPool &) = delete; + KernelArgPool(KernelArgPool &&) = delete; + + 
KernelArgPool(uint32_t kernarg_segment_size) + : kernarg_segment_size(kernarg_segment_size) { + + // atmi uses one pool per kernel for all gpus, with a fixed upper size + // preserving that exact scheme here, including the queue + { + hsa_status_t err = hsa_amd_memory_pool_allocate( + atl_gpu_kernarg_pools[0], + kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + &kernarg_region); + ErrorCheck(Allocating memory for the executable-kernel, err); + core::allow_access_to_all_gpu_agents(kernarg_region); + + for (int i = 0; i < MAX_NUM_KERNELS; i++) { + free_kernarg_segments.push(i); + } + } + } + + void *allocate(uint64_t arg_num) { + assert((arg_num * sizeof(void *)) == kernarg_segment_size); + lock l(&mutex); + void *res = nullptr; + if (!free_kernarg_segments.empty()) { + + int free_idx = free_kernarg_segments.front(); + res = static_cast(static_cast(kernarg_region) + + (free_idx * kernarg_size_including_implicit())); + assert(free_idx == pointer_to_index(res)); + free_kernarg_segments.pop(); + } + return res; + } + + void deallocate(void *ptr) { + lock l(&mutex); + int idx = pointer_to_index(ptr); + free_kernarg_segments.push(idx); + } + +private: + int pointer_to_index(void *ptr) { + ptrdiff_t bytes = + static_cast(ptr) - static_cast(kernarg_region); + assert(bytes >= 0); + assert(bytes % kernarg_size_including_implicit() == 0); + return bytes / kernarg_size_including_implicit(); + } + struct lock { + lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); } + ~lock() { pthread_mutex_unlock(m); } + pthread_mutex_t *m; + }; +}; +pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER; + +std::unordered_map> + KernelArgPoolMap; + +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy { + // execution mode of kernel + // 0 - SPMD mode (without master warp) + // 1 - Generic mode (with master warp) + int8_t ExecutionMode; + int16_t ConstWGSize; + int32_t device_id; + void *CallStackAddr = nullptr; + const char *Name; + + KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, + void *_CallStackAddr, const char *_Name, + uint32_t _kernarg_segment_size) + : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), + device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { + DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); + + std::string N(_Name); + if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { + KernelArgPoolMap.insert( + std::make_pair(N, std::unique_ptr( + new KernelArgPool(_kernarg_segment_size)))); + } + } +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. 
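
The `KernelArgPool` defined above carves one kernarg allocation into `MAX_NUM_KERNELS` fixed-size slots and hands out slot indices from a queue under a mutex; the list declared just below then owns one `KernelTy` per kernel. A self-contained sketch of the same free-index scheme over ordinary heap memory (the slot count, slot size and use of `std::mutex` are illustrative, not the plugin's HSA-backed types):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <queue>
#include <vector>

// Fixed-capacity pool: one backing buffer, free slots tracked by index.
class SlotPool {
  std::vector<uint8_t> storage; // stand-in for the hsa_amd_memory_pool block
  std::queue<int> free_slots;
  std::mutex m;
  size_t slot_bytes;

public:
  SlotPool(size_t slot_bytes, int num_slots)
      : storage(slot_bytes * num_slots), slot_bytes(slot_bytes) {
    for (int i = 0; i < num_slots; ++i)
      free_slots.push(i);
  }

  void *allocate() {
    std::lock_guard<std::mutex> l(m);
    if (free_slots.empty())
      return nullptr; // caller must handle exhaustion, as the plugin does
    int idx = free_slots.front();
    free_slots.pop();
    return storage.data() + idx * slot_bytes;
  }

  void deallocate(void *p) {
    std::lock_guard<std::mutex> l(m);
    ptrdiff_t bytes = static_cast<uint8_t *>(p) - storage.data();
    assert(bytes >= 0 && bytes % slot_bytes == 0);
    free_slots.push(static_cast<int>(bytes / slot_bytes));
  }
};

int main() {
  SlotPool pool(/*slot_bytes=*/64, /*num_slots=*/8);
  void *a = pool.allocate();
  void *b = pool.allocate();
  pool.deallocate(a);
  pool.deallocate(b);
  return 0;
}
```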
+std::list KernelsList; + +// ATMI API to get gpu and gpu memory place +static atmi_place_t get_gpu_place(int device_id) { + return ATMI_PLACE_GPU(0, device_id); +} +static atmi_mem_place_t get_gpu_mem_place(int device_id) { + return ATMI_MEM_PLACE_GPU_MEM(0, device_id, 0); +} + +static std::vector find_gpu_agents() { + std::vector res; + + hsa_status_t err = hsa_iterate_agents( + [](hsa_agent_t agent, void *data) -> hsa_status_t { + std::vector *res = + static_cast *>(data); + + hsa_device_type_t device_type; + // get_info fails iff HSA runtime not yet initialized + hsa_status_t err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + printf("rtl.cpp: err %d\n", err); + assert(err == HSA_STATUS_SUCCESS); + + if (device_type == HSA_DEVICE_TYPE_GPU) { + res->push_back(agent); + } + return HSA_STATUS_SUCCESS; + }, + &res); + + // iterate_agents fails iff HSA runtime not yet initialized + if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) + printf("rtl.cpp: err %d\n", err); + assert(err == HSA_STATUS_SUCCESS); + return res; +} + +static void callbackQueue(hsa_status_t status, hsa_queue_t *source, + void *data) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string; + if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) { + status_string = "unavailable"; + } + fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, + __LINE__, source, status, status_string); + abort(); + } +} + +namespace core { +void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) { + __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE); +} + +uint16_t create_header(hsa_packet_type_t type, int barrier, + atmi_task_fence_scope_t acq_fence, + atmi_task_fence_scope_t rel_fence) { + uint16_t header = type << HSA_PACKET_HEADER_TYPE; + header |= barrier << HSA_PACKET_HEADER_BARRIER; + header |= (hsa_fence_scope_t) static_cast( + acq_fence << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); + header |= (hsa_fence_scope_t) static_cast( + rel_fence << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + return header; +} +} // namespace core + +/// Class containing all the device information +class RTLDeviceInfoTy { + std::vector> FuncGblEntries; + +public: + // load binary populates symbol tables and mutates various global state + // run uses those symbol tables + std::shared_timed_mutex load_run_lock; + + int NumberOfDevices; + + // GPU devices + std::vector HSAAgents; + std::vector HSAQueues; // one per gpu + + // Device properties + std::vector ComputeUnits; + std::vector GroupsPerDevice; + std::vector ThreadsPerGroup; + std::vector WarpSize; + std::vector GPUName; + + // OpenMP properties + std::vector NumTeams; + std::vector NumThreads; + + // OpenMP Environment properties + int EnvNumTeams; + int EnvTeamLimit; + int EnvMaxTeamsDefault; + + // OpenMP Requires Flags + int64_t RequiresFlags; + + // Resource pools + SignalPoolT FreeSignalPool; + + struct atmiFreePtrDeletor { + void operator()(void *p) { + atmi_free(p); // ignore failure to free + } + }; + + // device_State shared across loaded binaries, error if inconsistent size + std::vector, uint64_t>> + deviceStateStore; + + static const unsigned HardTeamLimit = + (1 << 16) - 1; // 64K needed to fit in uint16 + static const int DefaultNumTeams = 128; + static const int Max_Teams = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; + static const int Warp_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; 
+ static const int Max_WG_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; + static const int Default_WG_Size = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; + + using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *, + size_t size, hsa_agent_t); + atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, + MemcpyFunc Func, int32_t deviceId) { + hsa_agent_t agent = HSAAgents[deviceId]; + hsa_signal_t s = FreeSignalPool.pop(); + if (s.handle == 0) { + return ATMI_STATUS_ERROR; + } + atmi_status_t r = Func(s, dest, src, size, agent); + FreeSignalPool.push(s); + return r; + } + + atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, + size_t size, int32_t deviceId) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId); + } + + atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, + size_t size, int32_t deviceId) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); + } + + // Record entry point associated with device + void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (auto &it : E.Entries) { + if (it.addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + int32_t size = E.Entries.size(); + + // Table is empty + if (!size) + return 0; + + __tgt_offload_entry *begin = &E.Entries[0]; + __tgt_offload_entry *end = &E.Entries[size - 1]; + + // Update table info according to the entries and return the pointer + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = ++end; + + return &E.Table; + } + + // Clear entries table for a device + void clearOffloadEntriesTable(int device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + // KernelArgPoolMap.clear(); + E.Entries.clear(); + E.Table.EntriesBegin = E.Table.EntriesEnd = 0; + } + + RTLDeviceInfoTy() { + // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr + // anytime. You do not need a debug library build. 
+ // 0 => no tracing + // 1 => tracing dispatch only + // >1 => verbosity increase + if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) + print_kernel_trace = atoi(envStr); + else + print_kernel_trace = 0; + + DP("Start initializing HSA-ATMI\n"); + atmi_status_t err = atmi_init(); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when initializing HSA-ATMI\n"); + return; + } + // Init hostcall soon after initializing ATMI + hostrpc_init(); + + HSAAgents = find_gpu_agents(); + NumberOfDevices = (int)HSAAgents.size(); + + if (NumberOfDevices == 0) { + DP("There are no devices supporting HSA.\n"); + return; + } else { + DP("There are %d devices supporting HSA.\n", NumberOfDevices); + } + + // Init the device info + HSAQueues.resize(NumberOfDevices); + FuncGblEntries.resize(NumberOfDevices); + ThreadsPerGroup.resize(NumberOfDevices); + ComputeUnits.resize(NumberOfDevices); + GPUName.resize(NumberOfDevices); + GroupsPerDevice.resize(NumberOfDevices); + WarpSize.resize(NumberOfDevices); + NumTeams.resize(NumberOfDevices); + NumThreads.resize(NumberOfDevices); + deviceStateStore.resize(NumberOfDevices); + + for (int i = 0; i < NumberOfDevices; i++) { + uint32_t queue_size = 0; + { + hsa_status_t err; + err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, + &queue_size); + ErrorCheck(Querying the agent maximum queue size, err); + if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { + queue_size = core::Runtime::getInstance().getMaxQueueSize(); + } + } + + hsa_status_t rc = hsa_queue_create( + HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, + UINT32_MAX, UINT32_MAX, &HSAQueues[i]); + if (rc != HSA_STATUS_SUCCESS) { + DP("Failed to create HSA queues\n"); + return; + } + + deviceStateStore[i] = {nullptr, 0}; + } + + for (int i = 0; i < NumberOfDevices; i++) { + ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size; + GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams; + ComputeUnits[i] = 1; + DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i, + GroupsPerDevice[i], ThreadsPerGroup[i]); + } + + // Get environment variables regarding teams + char *envStr = getenv("OMP_TEAM_LIMIT"); + if (envStr) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(envStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } else { + EnvTeamLimit = -1; + } + envStr = getenv("OMP_NUM_TEAMS"); + if (envStr) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(envStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } else { + EnvNumTeams = -1; + } + // Get environment variables regarding expMaxTeams + envStr = getenv("OMP_MAX_TEAMS_DEFAULT"); + if (envStr) { + EnvMaxTeamsDefault = std::stoi(envStr); + DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault); + } else { + EnvMaxTeamsDefault = -1; + } + + // Default state. 
+ RequiresFlags = OMP_REQ_UNDEFINED; + } + + ~RTLDeviceInfoTy() { + DP("Finalizing the HSA-ATMI DeviceInfo.\n"); + // Run destructors on types that use HSA before + // atmi_finalize removes access to it + deviceStateStore.clear(); + KernelArgPoolMap.clear(); + // Terminate hostrpc before finalizing ATMI + hostrpc_terminate(); + atmi_finalize(); + } +}; + +pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; + +// TODO: May need to drop the trailing to fields until deviceRTL is updated +struct omptarget_device_environmentTy { + int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG + // only useful for Debug build of deviceRTLs + int32_t num_devices; // gets number of active offload devices + int32_t device_num; // gets a value 0 to num_devices-1 +}; + +static RTLDeviceInfoTy DeviceInfo; + +namespace { + +int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); + // Return success if we are not copying back to host from target. + if (!HstPtr) + return OFFLOAD_SUCCESS; + atmi_status_t err; + DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)TgtPtr, + (long long unsigned)(Elf64_Addr)HstPtr); + + err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size, + DeviceId); + + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when copying data from device to host. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); + return OFFLOAD_FAIL; + } + DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)TgtPtr, + (long long unsigned)(Elf64_Addr)HstPtr); + return OFFLOAD_SUCCESS; +} + +int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + atmi_status_t err; + assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); + // Return success if we are not doing host to target. + if (!HstPtr) + return OFFLOAD_SUCCESS; + + DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, + (long long unsigned)(Elf64_Addr)HstPtr, + (long long unsigned)(Elf64_Addr)TgtPtr); + err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size, + DeviceId); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when copying data from host to device. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Async. +// The implementation was written with cuda streams in mind. The semantics of +// that are to execute kernels on a queue in order of insertion. A synchronise +// call then makes writes visible between host and device. This means a series +// of N data_submit_async calls are expected to execute serially. HSA offers +// various options to run the data copies concurrently. This may require changes +// to libomptarget. + +// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that +// there are no outstanding kernels that need to be synchronized. Any async call +// may be passed a Queue==0, at which point the cuda implementation will set it +// to non-null (see getStream). The cuda streams are per-device. 
Upstream may +// change this interface to explicitly initialize the async_info_pointer, but +// until then hsa lazily initializes it as well. + +void initAsyncInfoPtr(__tgt_async_info *async_info_ptr) { + // set non-null while using async calls, return to null to indicate completion + assert(async_info_ptr); + if (!async_info_ptr->Queue) { + async_info_ptr->Queue = reinterpret_cast(UINT64_MAX); + } +} +void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) { + assert(async_info_ptr); + assert(async_info_ptr->Queue); + async_info_ptr->Queue = 0; +} + +bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { + const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h + int32_t r = elf_check_machine(image, amdgcnMachineID); + if (!r) { + DP("Supported machine ID not found\n"); + } + return r; +} + +uint32_t elf_e_flags(__tgt_device_image *image) { + char *img_begin = (char *)image->ImageStart; + size_t img_size = (char *)image->ImageEnd - img_begin; + + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + Elf64_Ehdr *eh64 = elf64_getehdr(e); + + if (!eh64) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint32_t Flags = eh64->e_flags; + + elf_end(e); + DP("ELF Flags: 0x%x\n", Flags); + return Flags; +} +} // namespace + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_machine_id_is_amdgcn(image); +} + +int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceInfo.RequiresFlags = RequiresFlags; + return RequiresFlags; +} + +int32_t __tgt_rtl_init_device(int device_id) { + hsa_status_t err; + + // this is per device id init + DP("Initialize the device id: %d\n", device_id); + + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + + // Get number of Compute Unit + uint32_t compute_units = 0; + err = hsa_agent_get_info( + agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &compute_units); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.ComputeUnits[device_id] = 1; + DP("Error getting compute units : settiing to 1\n"); + } else { + DeviceInfo.ComputeUnits[device_id] = compute_units; + DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + } + + char GetInfoName[64]; // 64 max size returned by get info + err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, + (void *)GetInfoName); + if (err) + DeviceInfo.GPUName[device_id] = "--unknown gpu--"; + else { + DeviceInfo.GPUName[device_id] = GetInfoName; + } + + if (print_kernel_trace == 4) + fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id, + DeviceInfo.ComputeUnits[device_id], + DeviceInfo.GPUName[device_id].c_str()); + + // Query attributes to determine number of threads/block and blocks/grid. 
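
`elf_machine_id_is_amdgcn` above accepts an image only when the ELF header's machine ID equals 224 (AMDGPU), and `elf_e_flags` reads `e_flags` to recover the gfx architecture later. A standalone sketch of the same header check using `<elf.h>` directly rather than libelf; the constant is defined locally because, as the patch notes, `EM_AMDGPU` may be missing from older system headers, and the sample `e_flags` value in `main` is purely illustrative:

```cpp
#include <cstdint>
#include <cstring>
#include <elf.h>

// ELF machine ID for AMDGPU; may not be in system elf.h, so define it here.
static const uint16_t kAmdgcnMachineID = 224;

// Returns true when `image` looks like a 64-bit ELF built for AMDGPU,
// optionally reporting e_flags (which encodes the gfx architecture).
static bool image_is_amdgcn(const void *image, size_t size, uint32_t *flags) {
  if (size < sizeof(Elf64_Ehdr))
    return false;
  Elf64_Ehdr eh;
  std::memcpy(&eh, image, sizeof(eh)); // avoid alignment assumptions
  if (std::memcmp(eh.e_ident, ELFMAG, SELFMAG) != 0 ||
      eh.e_ident[EI_CLASS] != ELFCLASS64)
    return false;
  if (eh.e_machine != kAmdgcnMachineID)
    return false;
  if (flags)
    *flags = eh.e_flags;
  return true;
}

int main() {
  Elf64_Ehdr fake = {};
  std::memcpy(fake.e_ident, ELFMAG, SELFMAG);
  fake.e_ident[EI_CLASS] = ELFCLASS64;
  fake.e_machine = kAmdgcnMachineID;
  fake.e_flags = 0x30; // illustrative only; real images carry the gfx id here
  uint32_t flags = 0;
  return image_is_amdgcn(&fake, sizeof(fake), &flags) ? 0 : 1;
}
```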
+ uint16_t workgroup_max_dim[3]; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, + &workgroup_max_dim); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Error getting grid dims: num groups : %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; + DP("Using %d ROCm blocks per grid\n", + DeviceInfo.GroupsPerDevice[device_id]); + } else { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; + DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + } + + // Get thread limit + hsa_dim3_t grid_max_dim; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); + if (err == HSA_STATUS_SUCCESS) { + DeviceInfo.ThreadsPerGroup[device_id] = + reinterpret_cast(&grid_max_dim)[0] / + DeviceInfo.GroupsPerDevice[device_id]; + if ((DeviceInfo.ThreadsPerGroup[device_id] > + RTLDeviceInfoTy::Max_WG_Size) || + DeviceInfo.ThreadsPerGroup[device_id] == 0) { + DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + } else { + DP("Using ROCm Queried thread limit: %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } + } else { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Error getting max block dimension, use default:%d \n", + RTLDeviceInfoTy::Max_WG_Size); + } + + // Get wavefront size + uint32_t wavefront_size = 0; + err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); + if (err == HSA_STATUS_SUCCESS) { + DP("Queried wavefront size: %d\n", wavefront_size); + DeviceInfo.WarpSize[device_id] = wavefront_size; + } else { + DP("Default wavefront size: %d\n", + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]); + DeviceInfo.WarpSize[device_id] = + llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + } + + // Adjust teams to the env variables + if (DeviceInfo.EnvTeamLimit > 0 && + DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit; + DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", + DeviceInfo.EnvTeamLimit); + } + + // Set default number of teams + if (DeviceInfo.EnvNumTeams > 0) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; + DP("Default number of teams set according to environment %d\n", + DeviceInfo.EnvNumTeams); + } else { + char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); + int TeamsPerCU = 1; // default number of teams per CU is 1 + if (TeamsPerCUEnvStr) { + TeamsPerCU = std::stoi(TeamsPerCUEnvStr); + } + + DeviceInfo.NumTeams[device_id] = + TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; + DP("Default number of teams = %d * number of compute units %d\n", + TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); + } + + if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id]; + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceInfo.GroupsPerDevice[device_id]); + } + + // Set default number of threads + DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; + DP("Default number of threads set according to library's default %d\n", + 
RTLDeviceInfoTy::Default_WG_Size); + if (DeviceInfo.NumThreads[device_id] > + DeviceInfo.ThreadsPerGroup[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerGroup[device_id]; + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } + + DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", + device_id, DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.ThreadsPerGroup[device_id]); + + DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, + DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], + DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.GroupsPerDevice[device_id] * + DeviceInfo.ThreadsPerGroup[device_id]); + + return OFFLOAD_SUCCESS; +} + +namespace { +Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { + size_t N; + int rc = elf_getshdrnum(elf, &N); + if (rc != 0) { + return nullptr; + } + + Elf64_Shdr *result = nullptr; + for (size_t i = 0; i < N; i++) { + Elf_Scn *scn = elf_getscn(elf, i); + if (scn) { + Elf64_Shdr *shdr = elf64_getshdr(scn); + if (shdr) { + if (shdr->sh_type == SHT_HASH) { + if (result == nullptr) { + result = shdr; + } else { + // multiple SHT_HASH sections not handled + return nullptr; + } + } + } + } + } + return result; +} + +const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, + const char *symname) { + + assert(section_hash); + size_t section_symtab_index = section_hash->sh_link; + Elf64_Shdr *section_symtab = + elf64_getshdr(elf_getscn(elf, section_symtab_index)); + size_t section_strtab_index = section_symtab->sh_link; + + const Elf64_Sym *symtab = + reinterpret_cast(base + section_symtab->sh_offset); + + const uint32_t *hashtab = + reinterpret_cast(base + section_hash->sh_offset); + + // Layout: + // nbucket + // nchain + // bucket[nbucket] + // chain[nchain] + uint32_t nbucket = hashtab[0]; + const uint32_t *bucket = &hashtab[2]; + const uint32_t *chain = &hashtab[nbucket + 2]; + + const size_t max = strlen(symname) + 1; + const uint32_t hash = elf_hash(symname); + for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { + char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); + if (strncmp(symname, n, max) == 0) { + return &symtab[i]; + } + } + + return nullptr; +} + +typedef struct { + void *addr = nullptr; + uint32_t size = UINT32_MAX; + uint32_t sh_type = SHT_NULL; +} symbol_info; + +int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, + symbol_info *res) { + if (elf_kind(elf) != ELF_K_ELF) { + return 1; + } + + Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); + if (!section_hash) { + return 1; + } + + const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); + if (!sym) { + return 1; + } + + if (sym->st_size > UINT32_MAX) { + return 1; + } + + if (sym->st_shndx == SHN_UNDEF) { + return 1; + } + + Elf_Scn *section = elf_getscn(elf, sym->st_shndx); + if (!section) { + return 1; + } + + Elf64_Shdr *header = elf64_getshdr(section); + if (!header) { + return 1; + } + + res->addr = sym->st_value + base; + res->size = static_cast(sym->st_size); + res->sh_type = header->sh_type; + return 0; +} + +int get_symbol_info_without_loading(char *base, size_t img_size, + const char *symname, symbol_info *res) { + Elf *elf = elf_memory(base, img_size); + if (elf) { + int rc = get_symbol_info_without_loading(elf, base, symname, res); + elf_end(elf); + return rc; + } + return 1; +} + +atmi_status_t interop_get_symbol_info(char *base, size_t img_size, + 
const char *symname, void **var_addr, + uint32_t *var_size) { + symbol_info si; + int rc = get_symbol_info_without_loading(base, img_size, symname, &si); + if (rc == 0) { + *var_addr = si.addr; + *var_size = si.size; + return ATMI_STATUS_SUCCESS; + } else { + return ATMI_STATUS_ERROR; + } +} + +template +atmi_status_t module_register_from_memory_to_place(void *module_bytes, + size_t module_size, + atmi_place_t place, C cb) { + auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t { + C *unwrapped = static_cast(cb_state); + return (*unwrapped)(data, size); + }; + return atmi_module_register_from_memory_to_place( + module_bytes, module_size, place, L, static_cast(&cb)); +} +} // namespace + +static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { + uint64_t device_State_bytes = 0; + { + // If this is the deviceRTL, get the state variable size + symbol_info size_si; + int rc = get_symbol_info_without_loading( + ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); + + if (rc == 0) { + if (size_si.size != sizeof(uint64_t)) { + fprintf(stderr, + "Found device_State_size variable with wrong size, aborting\n"); + exit(1); + } + + // Read number of bytes directly from the elf + memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); + } + } + return device_State_bytes; +} + +static __tgt_target_table * +__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); + +static __tgt_target_table * +__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + DeviceInfo.load_run_lock.lock(); + __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); + DeviceInfo.load_run_lock.unlock(); + return res; +} + +struct device_environment { + // initialise an omptarget_device_environmentTy in the deviceRTL + // patches around differences in the deviceRTL between trunk, aomp, + // rocmcc. Over time these differences will tend to zero and this class + // simplified. + // Symbol may be in .data or .bss, and may be missing fields: + // - aomp has debug_level, num_devices, device_num + // - trunk has debug_level + // - under review in trunk is debug_level, device_num + // - rocmcc matches aomp, patch to swap num_devices and device_num + + // If the symbol is in .data (aomp, rocm) it can be written directly. + // If it is in .bss, we must wait for it to be allocated space on the + // gpu (trunk) and initialize after loading. 
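
The comment above distinguishes two cases for the device environment symbol: if it lives in an initialized section (.data) it can be patched in the host copy of the image before loading, but if it sits in .bss it only receives storage on the GPU after loading and must be written there afterwards. A small sketch of that decision; the struct layout mirrors `omptarget_device_environmentTy` from the patch, while the section kind and copy helpers are illustrative stand-ins for the symbol query and `freesignalpool_memcpy_h2d`:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Mirrors the environment block the deviceRTL expects (see the patch).
struct DeviceEnv {
  int32_t debug_level;
  int32_t num_devices;
  int32_t device_num;
};

// Stand-in for the ELF section-type check (SHT_NOBITS means .bss).
enum SectionKind { InImage, NoBits };

// Before load: patch the bytes inside the host copy of the image.
static void patch_host_image(char *image, size_t sym_offset, const DeviceEnv &env) {
  std::memcpy(image + sym_offset, &env, sizeof(env));
}

// After load: copy to the symbol's device address (placeholder for the
// plugin's host-to-device memcpy helper).
static void copy_to_device(void *device_addr, const DeviceEnv &env) {
  std::printf("h2d copy of %zu bytes to %p\n", sizeof(env), device_addr);
}

static void init_device_environment(SectionKind kind, char *host_image,
                                    size_t sym_offset, void *device_addr,
                                    const DeviceEnv &env) {
  if (kind == InImage)
    patch_host_image(host_image, sym_offset, env); // .data: write before load
  else
    copy_to_device(device_addr, env);              // .bss: write after load
}

int main() {
  char image[64] = {};
  DeviceEnv env = {0, 1, 0};
  init_device_environment(InImage, image, 16, nullptr, env);
  init_device_environment(NoBits, image, 16, image, env);
  return 0;
}
```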
+ const char *sym() { return "omptarget_device_environment"; } + + omptarget_device_environmentTy host_device_env; + symbol_info si; + bool valid = false; + + __tgt_device_image *image; + const size_t img_size; + + device_environment(int device_id, int number_devices, + __tgt_device_image *image, const size_t img_size) + : image(image), img_size(img_size) { + + host_device_env.num_devices = number_devices; + host_device_env.device_num = device_id; + host_device_env.debug_level = 0; +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + host_device_env.debug_level = std::stoi(envStr); + } +#endif + + int rc = get_symbol_info_without_loading((char *)image->ImageStart, + img_size, sym(), &si); + if (rc != 0) { + DP("Finding global device environment '%s' - symbol missing.\n", sym()); + return; + } + + if (si.size > sizeof(host_device_env)) { + DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, + sizeof(host_device_env)); + return; + } + + valid = true; + } + + bool in_image() { return si.sh_type != SHT_NOBITS; } + + atmi_status_t before_loading(void *data, size_t size) { + assert(valid); + if (in_image()) { + DP("Setting global device environment before load (%u bytes)\n", si.size); + uint64_t offset = (char *)si.addr - (char *)image->ImageStart; + void *pos = (char *)data + offset; + memcpy(pos, &host_device_env, si.size); + } + return ATMI_STATUS_SUCCESS; + } + + atmi_status_t after_loading() { + assert(valid); + if (!in_image()) { + DP("Setting global device environment after load (%u bytes)\n", si.size); + int device_id = host_device_env.device_num; + + void *state_ptr; + uint32_t state_ptr_size; + atmi_status_t err = atmi_interop_hsa_get_symbol_info( + get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size); + if (err != ATMI_STATUS_SUCCESS) { + DP("failed to find %s in loaded image\n", sym()); + return err; + } + + if (state_ptr_size != si.size) { + DP("Symbol had size %u before loading, %u after\n", state_ptr_size, + si.size); + return ATMI_STATUS_ERROR; + } + + return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, + state_ptr_size, device_id); + } + return ATMI_STATUS_SUCCESS; + } +}; + +static atmi_status_t atmi_calloc(void **ret_ptr, size_t size, + atmi_mem_place_t place) { + uint64_t rounded = 4 * ((size + 3) / 4); + void *ptr; + atmi_status_t err = atmi_malloc(&ptr, rounded, place); + if (err != ATMI_STATUS_SUCCESS) { + return err; + } + + hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); + if (rc != HSA_STATUS_SUCCESS) { + fprintf(stderr, "zero fill device_state failed with %u\n", rc); + atmi_free(ptr); + return ATMI_STATUS_ERROR; + } + + *ret_ptr = ptr; + return ATMI_STATUS_SUCCESS; +} + +__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, + __tgt_device_image *image) { + // This function loads the device image onto gpu[device_id] and does other + // per-image initialization work. Specifically: + // + // - Initialize an omptarget_device_environmentTy instance embedded in the + // image at the symbol "omptarget_device_environment" + // Fields debug_level, device_num, num_devices. Used by the deviceRTL. 
+ // + // - Allocate a large array per-gpu (could be moved to init_device) + // - Read a uint64_t at symbol omptarget_nvptx_device_State_size + // - Allocate at least that many bytes of gpu memory + // - Zero initialize it + // - Write the pointer to the symbol omptarget_nvptx_device_State + // + // - Pulls some per-kernel information together from various sources and + // records it in the KernelsList for quicker access later + // + // The initialization can be done before or after loading the image onto the + // gpu. This function presently does a mixture. Using the hsa api to get/set + // the information is simpler to implement, in exchange for more complicated + // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes + // back from the gpu vs a hashtable lookup on the host. + + const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; + + DeviceInfo.clearOffloadEntriesTable(device_id); + + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use + + if (!elf_machine_id_is_amdgcn(image)) { + return NULL; + } + + { + auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, + img_size); + if (!env.valid) { + return NULL; + } + + atmi_status_t err = module_register_from_memory_to_place( + (void *)image->ImageStart, img_size, get_gpu_place(device_id), + [&](void *data, size_t size) { + return env.before_loading(data, size); + }); + + check("Module registering", err); + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, + "Possible gpu arch mismatch: device:%s, image:%s please check" + " compiler flag: -march=\n", + DeviceInfo.GPUName[device_id].c_str(), + get_elf_mach_gfx_name(elf_e_flags(image))); + return NULL; + } + + err = env.after_loading(); + if (err != ATMI_STATUS_SUCCESS) { + return NULL; + } + } + + DP("ATMI module successfully loaded!\n"); + + { + // the device_State array is either large value in bss or a void* that + // needs to be assigned to a pointer to an array of size device_state_bytes + + void *state_ptr; + uint32_t state_ptr_size; + atmi_status_t err = atmi_interop_hsa_get_symbol_info( + get_gpu_mem_place(device_id), "omptarget_nvptx_device_State", + &state_ptr, &state_ptr_size); + + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "failed to find device_state symbol\n"); + return NULL; + } + + if (state_ptr_size < sizeof(void *)) { + fprintf(stderr, "unexpected size of state_ptr %u != %zu\n", + state_ptr_size, sizeof(void *)); + return NULL; + } + + // if it's larger than a void*, assume it's a bss array and no further + // initialization is required. 
Only try to set up a pointer for + // sizeof(void*) + if (state_ptr_size == sizeof(void *)) { + uint64_t device_State_bytes = + get_device_State_bytes((char *)image->ImageStart, img_size); + if (device_State_bytes == 0) { + return NULL; + } + + auto &dss = DeviceInfo.deviceStateStore[device_id]; + if (dss.first.get() == nullptr) { + assert(dss.second == 0); + void *ptr = NULL; + atmi_status_t err = + atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id)); + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "Failed to allocate device_state array\n"); + return NULL; + } + dss = {std::unique_ptr{ptr}, + device_State_bytes}; + } + + void *ptr = dss.first.get(); + if (device_State_bytes != dss.second) { + fprintf(stderr, "Inconsistent sizes of device_State unsupported\n"); + exit(1); + } + + // write ptr to device memory so it can be used by later kernels + err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, + sizeof(void *), device_id); + if (err != ATMI_STATUS_SUCCESS) { + fprintf(stderr, "memcpy install of state_ptr failed\n"); + return NULL; + } + } + } + + // TODO: Check with Guansong to understand the below comment more thoroughly. + // Here, we take advantage of the data that is appended after img_end to get + // the symbols' name we need to load. This data consist of the host entries + // begin and end as well as the target name (see the offloading linker script + // creation in clang compiler). + + // Find the symbols in the module by name. The name can be obtain by + // concatenating the host entry name with the target name + + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; + + for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + + if (!e->addr) { + // The host should have always something in the address to + // uniquely identify the target region. + fprintf(stderr, "Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); + return NULL; + } + + if (e->size) { + __tgt_offload_entry entry = *e; + + void *varptr; + uint32_t varsize; + + atmi_status_t err = atmi_interop_hsa_get_symbol_info( + get_gpu_mem_place(device_id), e->name, &varptr, &varsize); + + if (err != ATMI_STATUS_SUCCESS) { + DP("Loading global '%s' (Failed)\n", e->name); + // Inform the user what symbol prevented offloading + fprintf(stderr, "Loading global '%s' (Failed)\n", e->name); + return NULL; + } + + if (varsize != e->size) { + DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, + varsize, e->size); + return NULL; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); + entry.addr = (void *)varptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + e->flags & OMP_DECLARE_TARGET_LINK) { + // If unified memory is present any target link variables + // can access host addresses directly. There is no longer a + // need for device copies. 
+ err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, + sizeof(void *), device_id); + if (err != ATMI_STATUS_SUCCESS) + DP("Error when copying USM\n"); + DP("Copy linked variable host address (" DPxMOD ")" + "to device address (" DPxMOD ")\n", + DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); + } + + continue; + } + + DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); + + atmi_mem_place_t place = get_gpu_mem_place(device_id); + uint32_t kernarg_segment_size; + atmi_status_t err = atmi_interop_hsa_get_kernel_info( + place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernarg_segment_size); + + // each arg is a void * in this openmp implementation + uint32_t arg_num = kernarg_segment_size / sizeof(void *); + std::vector arg_sizes(arg_num); + for (std::vector::iterator it = arg_sizes.begin(); + it != arg_sizes.end(); it++) { + *it = sizeof(void *); + } + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // get Kernel Descriptor if present. + // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + uint8_t Mode; + }; + struct KernDescValType KernDescVal; + std::string KernDescNameStr(e->name); + KernDescNameStr += "_kern_desc"; + const char *KernDescName = KernDescNameStr.c_str(); + + void *KernDescPtr; + uint32_t KernDescSize; + void *CallStackAddr = nullptr; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + KernDescName, &KernDescPtr, &KernDescSize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)KernDescSize != sizeof(KernDescVal)) + DP("Loading global computation properties '%s' - size mismatch (%u != " + "%lu)\n", + KernDescName, KernDescSize, sizeof(KernDescVal)); + + memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + + // Check structure size against recorded size. 
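
The `_kern_desc` lookup above reads a small packed descriptor (Version, TSize, WG_Size, Mode) straight out of the image bytes, and the check that follows compares the recorded size before trusting the contents. A standalone sketch of that read-and-verify pattern; the field layout follows the `KernDescValType` shown in the patch, while the buffer and lookup are illustrative:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same shape as KernDescValType in the patch.
struct KernDesc {
  uint16_t Version;
  uint16_t TSize;
  uint16_t WG_Size;
  uint8_t Mode;
};

// Copy a descriptor out of raw image bytes, refusing obviously wrong sizes.
static bool read_kern_desc(const void *sym_addr, uint32_t sym_size, KernDesc *out) {
  if (sym_size != sizeof(KernDesc)) {
    std::fprintf(stderr, "size mismatch (%u != %zu)\n", sym_size, sizeof(KernDesc));
    return false;
  }
  std::memcpy(out, sym_addr, sizeof(KernDesc));
  return true;
}

int main() {
  // Illustrative bytes, as a compiler might emit them for one kernel.
  KernDesc emitted = {/*Version=*/2, /*TSize=*/sizeof(KernDesc),
                      /*WG_Size=*/256, /*Mode=*/1};
  KernDesc parsed;
  if (read_kern_desc(&emitted, sizeof(emitted), &parsed))
    std::printf("exec mode %d, wg size %d\n", parsed.Mode, parsed.WG_Size);
  return 0;
}
```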
+ if ((size_t)KernDescSize != KernDescVal.TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + + DP("After loading global for %s KernDesc \n", KernDescName); + DP("KernDesc: Version: %d\n", KernDescVal.Version); + DP("KernDesc: TSize: %d\n", KernDescVal.TSize); + DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + DP("KernDesc: Mode: %d\n", KernDescVal.Mode); + + // Get ExecMode + ExecModeVal = KernDescVal.Mode; + DP("ExecModeVal %d\n", ExecModeVal); + if (KernDescVal.WG_Size == 0) { + KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; + DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + } + WGSizeVal = KernDescVal.WG_Size; + DP("WGSizeVal %d\n", WGSizeVal); + check("Loading KernDesc computation property", err); + } else { + DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); + + // Generic + std::string ExecModeNameStr(e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + void *ExecModePtr; + uint32_t varsize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + ExecModeName, &ExecModePtr, &varsize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)varsize != sizeof(int8_t)) { + DP("Loading global computation properties '%s' - size mismatch(%u != " + "%lu)\n", + ExecModeName, varsize, sizeof(int8_t)); + return NULL; + } + + memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); + + DP("After loading global for %s ExecMode = %d\n", ExecModeName, + ExecModeVal); + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in HSA code object file: " + "%d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value " + "GENERIC (1)\n", + ExecModeName); + } + check("Loading computation property", err); + + // Flat group size + std::string WGSizeNameStr(e->name); + WGSizeNameStr += "_wg_size"; + const char *WGSizeName = WGSizeNameStr.c_str(); + + void *WGSizePtr; + uint32_t WGSize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + WGSizeName, &WGSizePtr, &WGSize); + + if (err == ATMI_STATUS_SUCCESS) { + if ((size_t)WGSize != sizeof(int16_t)) { + DP("Loading global computation properties '%s' - size mismatch (%u " + "!= " + "%lu)\n", + WGSizeName, WGSize, sizeof(int16_t)); + return NULL; + } + + memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); + + DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); + + if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || + WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { + DP("Error wrong WGSize value specified in HSA code object file: " + "%d\n", + WGSizeVal); + WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + } + } else { + DP("Warning: Loading WGSize '%s' - symbol not found, " + "using default value %d\n", + WGSizeName, WGSizeVal); + } + + check("Loading WGSize computation property", err); + } + + KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, + CallStackAddr, e->name, + kernarg_segment_size)); + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) { + void *ptr = NULL; + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + 
atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id)); + DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL; + return ptr; +} + +int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info async_info; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, + __tgt_async_info *async_info_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + if (async_info_ptr) { + initAsyncInfoPtr(async_info_ptr); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); + } else { + return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); + } +} + +int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info async_info; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + initAsyncInfoPtr(async_info_ptr); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); +} + +int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + atmi_status_t err; + DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); + err = atmi_free(tgt_ptr); + if (err != ATMI_STATUS_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Determine launch values for threadsPerGroup and num_groups. +// Outputs: treadsPerGroup, num_groups +// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode, +// EnvTeamLimit, EnvNumTeams, num_teams, thread_limit, +// loop_tripcount. +void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, + int ExecutionMode, int EnvTeamLimit, int EnvNumTeams, + int num_teams, int thread_limit, uint64_t loop_tripcount, + int32_t device_id) { + + int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0 + ? 
DeviceInfo.EnvMaxTeamsDefault + : DeviceInfo.NumTeams[device_id]; + if (Max_Teams > DeviceInfo.HardTeamLimit) + Max_Teams = DeviceInfo.HardTeamLimit; + + if (print_kernel_trace == 4) { + fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n", + RTLDeviceInfoTy::Max_Teams); + fprintf(stderr, "Max_Teams: %d\n", Max_Teams); + fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n", + RTLDeviceInfoTy::Warp_Size); + fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n", + RTLDeviceInfoTy::Max_WG_Size); + fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + fprintf(stderr, "thread_limit: %d\n", thread_limit); + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize); + } + // check for thread_limit() clause + if (thread_limit > 0) { + threadsPerGroup = thread_limit; + DP("Setting threads per block to requested %d\n", thread_limit); + if (ExecutionMode == GENERIC) { // Add master warp for GENERIC + threadsPerGroup += RTLDeviceInfoTy::Warp_Size; + DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size); + } + if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max + threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; + DP("Setting threads per block to maximum %d\n", threadsPerGroup); + } + } + // check flat_max_work_group_size attr here + if (threadsPerGroup > ConstWGSize) { + threadsPerGroup = ConstWGSize; + DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", + threadsPerGroup); + } + if (print_kernel_trace == 4) + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + DP("Preparing %d threads\n", threadsPerGroup); + + // Set default num_groups (teams) + if (DeviceInfo.EnvTeamLimit > 0) + num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit) + ? Max_Teams + : DeviceInfo.EnvTeamLimit; + else + num_groups = Max_Teams; + DP("Set default num of groups %d\n", num_groups); + + if (print_kernel_trace == 4) { + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "num_teams: %d\n", num_teams); + } + + // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size + // This reduction is typical for default case (no thread_limit clause). + // or when user goes crazy with num_teams clause. + // FIXME: We cant distinguish between a constant or variable thread limit. + // So we only handle constant thread_limits. + if (threadsPerGroup > + RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 + // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size + // here? + num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; + + // check for num_teams() clause + if (num_teams > 0) { + num_groups = (num_teams < num_groups) ? num_teams : num_groups; + } + if (print_kernel_trace == 4) { + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams); + fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit); + } + + if (DeviceInfo.EnvNumTeams > 0) { + num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams + : num_groups; + DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams); + } else if (DeviceInfo.EnvTeamLimit > 0) { + num_groups = (DeviceInfo.EnvTeamLimit < num_groups) + ? 
DeviceInfo.EnvTeamLimit + : num_groups; + DP("Modifying teams based on EnvTeamLimit%d\n", DeviceInfo.EnvTeamLimit); + } else { + if (num_teams <= 0) { + if (loop_tripcount > 0) { + if (ExecutionMode == SPMD) { + // round up to the nearest integer + num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; + } else { + num_groups = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", + num_groups, loop_tripcount, threadsPerGroup); + } + } else { + num_groups = num_teams; + } + if (num_groups > Max_Teams) { + num_groups = Max_Teams; + if (print_kernel_trace == 4) + fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups, + Max_Teams); + } + if (num_groups > num_teams && num_teams > 0) { + num_groups = num_teams; + if (print_kernel_trace == 4) + fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n", + num_groups, num_teams); + } + } + + // num_teams clause always honored, no matter what, unless DEFAULT is active. + if (num_teams > 0) { + num_groups = num_teams; + // Cap num_groups to EnvMaxTeamsDefault if set. + if (DeviceInfo.EnvMaxTeamsDefault > 0 && + num_groups > DeviceInfo.EnvMaxTeamsDefault) + num_groups = DeviceInfo.EnvMaxTeamsDefault; + } + if (print_kernel_trace == 4) { + fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); + fprintf(stderr, "num_groups: %d\n", num_groups); + fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount); + } + DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, + threadsPerGroup); +} + +static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + bool full = true; + while (full) { + full = + packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue)); + } + return packet_id; +} + +extern bool g_atmi_hostcall_required; // declared without header by atmi + +static int32_t __tgt_rtl_run_target_team_region_locked( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount); + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = __tgt_rtl_run_target_team_region_locked( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams, + thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + +int32_t __tgt_rtl_run_target_team_region_locked( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + // Set the context we are using + // update thread limit content in gpu memory if un-initialized or specified + // from host + + DP("Run target team region thread_limit %d\n", thread_limit); + + // All args are references. 
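getLaunchVals above is where the thread_limit and num_teams clauses, the environment caps, and the loop trip count are folded into the launch geometry consumed by the argument marshalling and dispatch code that follows. A condensed, compilable sketch of that clamping order (the constants mirror limits quoted in this patch's comments, and the simplifications, dropping the trace output and the environment overrides, are assumptions for illustration):

#include <cstdint>
#include <cstdio>

enum Mode { SPMD = 0, GENERIC = 1 };
// Limits as quoted in this patch's comments (assumed for the example).
constexpr int WarpSize = 64, DefaultWG = 256, MaxWG = 1024;

void launchVals(int &threads, int &groups, int maxTeams, int constWGSize,
                Mode mode, int numTeams, int threadLimit, uint64_t tripCount) {
  threads = DefaultWG;
  if (threadLimit > 0) {          // thread_limit() clause
    threads = threadLimit;
    if (mode == GENERIC)          // GENERIC adds the master wavefront
      threads += WarpSize;
    if (threads > MaxWG)
      threads = MaxWG;
  }
  if (threads > constWGSize)      // flat work-group-size attribute wins
    threads = constWGSize;

  groups = maxTeams;
  if (threads > DefaultWG)        // bigger blocks, fewer groups
    groups = (maxTeams * MaxWG) / threads;

  if (numTeams > 0) {
    groups = numTeams;            // num_teams() clause always honored
  } else {
    if (tripCount > 0)
      groups = (mode == SPMD) ? int((tripCount - 1) / uint64_t(threads)) + 1
                              : int(tripCount);
    if (groups > maxTeams)        // the default path is capped at Max_Teams
      groups = maxTeams;
  }
}

int main() {
  int t = 0, g = 0;
  launchVals(t, g, /*maxTeams=*/128, /*constWGSize=*/1024, SPMD,
             /*numTeams=*/0, /*threadLimit=*/0, /*tripCount=*/100000);
  std::printf("teams=%d threads=%d\n", g, t); // 128 teams of 256 threads
}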
+ std::vector args(arg_num); + std::vector ptrs(arg_num); + + DP("Arg_num: %d\n", arg_num); + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); + } + + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + + /* + * Set limit based on ThreadsPerGroup and GroupsPerDevice + */ + int num_groups = 0; + + int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; + + getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize, + KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit, + DeviceInfo.EnvNumTeams, + num_teams, // From run_region arg + thread_limit, // From run_region arg + loop_tripcount, // From run_region arg + KernelInfo->device_id); + + if (print_kernel_trace >= 1) + // enum modes are SPMD, GENERIC, NONE 0,1,2 + fprintf(stderr, + "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) n:%s\n", + device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, + arg_num, num_groups, threadsPerGroup, num_teams, thread_limit, + KernelInfo->Name); + + // Run on the device. + { + hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id]; + uint64_t packet_id = acquire_available_packet_id(queue); + + const uint32_t mask = queue->size - 1; // size is a power of 2 + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + + (packet_id & mask); + + // packet->header is written last + packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = threadsPerGroup; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->reserved0 = 0; + packet->grid_size_x = num_groups * threadsPerGroup; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = 0; + packet->group_segment_size = 0; + packet->kernel_object = 0; + packet->kernarg_address = 0; // use the block allocator + packet->reserved2 = 0; // atmi writes id_ here + packet->completion_signal = {0}; // may want a pool of signals + + std::string kernel_name = std::string(KernelInfo->Name); + { + assert(KernelInfoTable[device_id].find(kernel_name) != + KernelInfoTable[device_id].end()); + auto it = KernelInfoTable[device_id][kernel_name]; + packet->kernel_object = it.kernel_object; + packet->private_segment_size = it.private_segment_size; + packet->group_segment_size = it.group_segment_size; + assert(arg_num == (int)it.num_args); + } + + KernelArgPool *ArgPool = nullptr; + { + auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (it != KernelArgPoolMap.end()) { + ArgPool = (it->second).get(); + } + } + if (!ArgPool) { + fprintf(stderr, "Warning: No ArgPool for %s on device %d\n", + KernelInfo->Name, device_id); + } + { + void *kernarg = nullptr; + if (ArgPool) { + assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); + kernarg = ArgPool->allocate(arg_num); + } + if (!kernarg) { + printf("Allocate kernarg failed\n"); + exit(1); + } + + // Copy explicit arguments + for (int i = 0; i < arg_num; i++) { + memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); + } + + // Initialize implicit arguments. 
ATMI seems to leave most fields + // uninitialized + atmi_implicit_args_t *impl_args = + reinterpret_cast( + static_cast(kernarg) + ArgPool->kernarg_segment_size); + memset(impl_args, 0, + sizeof(atmi_implicit_args_t)); // may not be necessary + impl_args->offset_x = 0; + impl_args->offset_y = 0; + impl_args->offset_z = 0; + + // assign a hostcall buffer for the selected Q + if (g_atmi_hostcall_required) { + // hostrpc_assign_buffer is not thread safe, and this function is + // under a multiple reader lock, not a writer lock. + static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&hostcall_init_lock); + impl_args->hostcall_ptr = hostrpc_assign_buffer( + DeviceInfo.HSAAgents[device_id], queue, device_id); + pthread_mutex_unlock(&hostcall_init_lock); + if (!impl_args->hostcall_ptr) { + DP("hostrpc_assign_buffer failed, gpu would dereference null and " + "error\n"); + return OFFLOAD_FAIL; + } + } + + packet->kernarg_address = kernarg; + } + + { + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + printf("Failed to get signal instance\n"); + exit(1); + } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); + } + + core::packet_store_release( + reinterpret_cast(packet), + core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH, 0, + ATMI_FENCE_SCOPE_SYSTEM, ATMI_FENCE_SCOPE_SYSTEM), + packet->setup); + + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); + + while (hsa_signal_wait_scacquire(packet->completion_signal, + HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + ; + + assert(ArgPool); + ArgPool->deallocate(packet->kernarg_address); + DeviceInfo.FreeSignalPool.push(packet->completion_signal); + } + + DP("Kernel completed\n"); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, team_num, + thread_limit, 0); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + initAsyncInfoPtr(async_info_ptr); + + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, team_num, + thread_limit, 0); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info is nullptr"); + + // Cuda asserts that async_info_ptr->Queue is non-null, but this invariant + // is not ensured by devices.cpp for amdgcn + // assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + if (async_info_ptr->Queue) { + finiAsyncInfoPtr(async_info_ptr); + } + return OFFLOAD_SUCCESS; +} diff --git a/libomptarget/plugins/common/CMakeLists.txt b/libomptarget/plugins/common/CMakeLists.txt new file mode 100644 index 000000000..ebc4795aa --- /dev/null +++ b/libomptarget/plugins/common/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License 
v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +add_subdirectory(elf_common) +add_subdirectory(MemoryManager) diff --git a/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/libomptarget/plugins/common/MemoryManager/CMakeLists.txt new file mode 100644 index 000000000..7f2e7c7c8 --- /dev/null +++ b/libomptarget/plugins/common/MemoryManager/CMakeLists.txt @@ -0,0 +1,11 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +add_library(MemoryManager INTERFACE) + +target_include_directories(MemoryManager INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/libomptarget/plugins/common/MemoryManager/MemoryManager.h new file mode 100644 index 000000000..6e00728a6 --- /dev/null +++ b/libomptarget/plugins/common/MemoryManager/MemoryManager.h @@ -0,0 +1,346 @@ +//===----------- MemoryManager.h - Target independent memory manager ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent memory manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "omptargetplugin.h" + +/// Base class of per-device allocator. +class DeviceAllocatorTy { +public: + virtual ~DeviceAllocatorTy() = default; + + /// Allocate a memory of size \p Size . \p HstPtr is used to assist the + /// allocation. + virtual void *allocate(size_t Size, void *HstPtr) = 0; + + /// Delete the pointer \p TgtPtr on the device + virtual int free(void *TgtPtr) = 0; +}; + +/// Class of memory manager. The memory manager is per-device by using +/// per-device allocator. Therefore, each plugin using memory manager should +/// have an allocator for each device. +class MemoryManagerTy { + static constexpr const size_t BucketSize[] = { + 0, 1U << 2, 1U << 3, 1U << 4, 1U << 5, 1U << 6, 1U << 7, + 1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13}; + + static constexpr const int NumBuckets = + sizeof(BucketSize) / sizeof(BucketSize[0]); + + /// Find the previous number that is power of 2 given a number that is not + /// power of 2. 
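The bucketing scheme relies on the floor-to-power-of-two helper defined just below. A standalone sketch showing where a few request sizes land (the helper is copied so the example compiles on its own, and a linear scan stands in for the binary search done by findBucket):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Same bit-smearing trick as the helper below: round down to a power of two.
static size_t floorToPowerOfTwo(size_t Num) {
  Num |= Num >> 1;
  Num |= Num >> 2;
  Num |= Num >> 4;
  Num |= Num >> 8;
  Num |= Num >> 16;
#if INTPTR_MAX == INT64_MAX
  Num |= Num >> 32;
#endif
  Num += 1;        // next power of two
  return Num >> 1; // previous power of two (Num itself if it already was one)
}

int main() {
  // Bucket sizes from the patch: 0, 4, 8, ..., 8192.
  const size_t Bucket[] = {0,        1u << 2,  1u << 3,  1u << 4,  1u << 5,
                           1u << 6,  1u << 7,  1u << 8,  1u << 9,  1u << 10,
                           1u << 11, 1u << 12, 1u << 13};
  const int NumBuckets = sizeof(Bucket) / sizeof(Bucket[0]);
  const size_t Sizes[] = {1, 31, 4096, 5000};
  for (size_t Size : Sizes) {
    const size_t F = floorToPowerOfTwo(Size);
    int B = 0;
    for (int I = 0; I < NumBuckets; ++I) // linear scan instead of binary search
      if (Bucket[I] <= F)
        B = I;
    std::printf("size %5zu -> floor %5zu -> bucket %d\n", Size, F, B);
  }
}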
+  static size_t floorToPowerOfTwo(size_t Num) {
+    Num |= Num >> 1;
+    Num |= Num >> 2;
+    Num |= Num >> 4;
+    Num |= Num >> 8;
+    Num |= Num >> 16;
+#if INTPTR_MAX == INT64_MAX
+    Num |= Num >> 32;
+#elif INTPTR_MAX == INT32_MAX
+    // Do nothing with 32-bit
+#else
+#error Unsupported architecture
+#endif
+    Num += 1;
+    return Num >> 1;
+  }
+
+  /// Find a suitable bucket
+  static int findBucket(size_t Size) {
+    const size_t F = floorToPowerOfTwo(Size);
+
+    DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
+
+    int L = 0, H = NumBuckets - 1;
+    while (H - L > 1) {
+      int M = (L + H) >> 1;
+      if (BucketSize[M] == F)
+        return M;
+      if (BucketSize[M] > F)
+        H = M - 1;
+      else
+        L = M;
+    }
+
+    assert(L >= 0 && L < NumBuckets && "L is out of range");
+
+    DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
+
+    return L;
+  }
+
+  /// A structure stores the meta data of a target pointer
+  struct NodeTy {
+    /// Memory size
+    const size_t Size;
+    /// Target pointer
+    void *Ptr;
+
+    /// Constructor
+    NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
+  };
+
+  /// To make \p NodePtrTy ordered when they're put into \p std::multiset.
+  struct NodeCmpTy {
+    bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
+      return LHS.Size < RHS.Size;
+    }
+  };
+
+  /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
+  /// the look up procedure more efficient.
+  using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
+
+  /// A list of \p FreeListTy entries, each of which is a \p std::multiset of
+  /// Nodes whose size is less or equal to a specific bucket size.
+  std::vector<FreeListTy> FreeLists;
+  /// A list of mutex for each \p FreeListTy entry
+  std::vector<std::mutex> FreeListLocks;
+  /// A table to map from a target pointer to its node
+  std::unordered_map<void *, NodeTy> PtrToNodeTable;
+  /// The mutex for the table \p PtrToNodeTable
+  std::mutex MapTableLock;
+
+  /// The reference to a device allocator
+  DeviceAllocatorTy &DeviceAllocator;
+
+  /// The threshold to manage memory using memory manager. If the request size
+  /// is larger than \p SizeThreshold, the allocation will not be managed by the
+  /// memory manager.
+  size_t SizeThreshold = 1U << 13;
+
+  /// Request memory from target device
+  void *allocateOnDevice(size_t Size, void *HstPtr) const {
+    return DeviceAllocator.allocate(Size, HstPtr);
+  }
+
+  /// Deallocate data on device
+  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
+
+  /// This function is called when it tries to allocate memory on device but the
+  /// device returns out of memory. It will first free all memory in the
+  /// FreeList and try to allocate again.
+  void *freeAndAllocate(size_t Size, void *HstPtr) {
+    std::vector<void *> RemoveList;
+
+    // Deallocate all memory in FreeList
+    for (int I = 0; I < NumBuckets; ++I) {
+      FreeListTy &List = FreeLists[I];
+      std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
+      if (List.empty())
+        continue;
+      for (const NodeTy &N : List) {
+        deleteOnDevice(N.Ptr);
+        RemoveList.push_back(N.Ptr);
+      }
+      FreeLists[I].clear();
+    }
+
+    // Remove all nodes in the map table which have been released
+    if (!RemoveList.empty()) {
+      std::lock_guard<std::mutex> LG(MapTableLock);
+      for (void *P : RemoveList)
+        PtrToNodeTable.erase(P);
+    }
+
+    // Try allocate memory again
+    return allocateOnDevice(Size, HstPtr);
+  }
+
+  /// The goal is to allocate memory on the device. It first tries to
+  /// allocate directly on the device. If a \p nullptr is returned, it might
+  /// be because the device is OOM. In that case, it will free all unused
+  /// memory and then try again.
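The comment above describes the allocate, purge, retry policy implemented by allocateOrFreeAndAllocateOnDevice just below. A toy demonstration of why the second attempt can succeed (FakeDevice and Cache are illustrative stand-ins, not plugin types):

#include <cstdio>
#include <vector>

struct FakeDevice {
  int Capacity = 2;                 // can hold at most two live allocations
  int Live = 0;
  // The returned pointer value is irrelevant here; only success/failure matters.
  void *allocate() { return Live < Capacity ? (++Live, &Live) : nullptr; }
  void free(void *) { --Live; }
};

struct Cache {
  std::vector<void *> Unused;       // blocks kept around for reuse
  void purge(FakeDevice &D) {
    for (void *P : Unused)
      D.free(P);
    Unused.clear();
  }
};

void *allocateOrPurgeAndRetry(FakeDevice &D, Cache &C) {
  if (void *P = D.allocate())
    return P;                       // fast path: device had room
  C.purge(D);                       // OOM: release everything we cached
  return D.allocate();              // second (and last) attempt
}

int main() {
  FakeDevice D;
  Cache C;
  C.Unused.push_back(D.allocate()); // two cached-but-unused blocks fill ...
  C.Unused.push_back(D.allocate()); // ... the whole device
  void *P = allocateOrPurgeAndRetry(D, C);
  std::printf("allocation %s after purging the cache\n",
              P ? "succeeded" : "failed");
}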
+ void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { + void *TgtPtr = allocateOnDevice(Size, HstPtr); + // We cannot get memory from the device. It might be due to OOM. Let's + // free all memory in FreeLists and try again. + if (TgtPtr == nullptr) { + DP("Failed to get memory on device. Free all memory in FreeLists and " + "try again.\n"); + TgtPtr = freeAndAllocate(Size, HstPtr); + } + + if (TgtPtr == nullptr) + DP("Still cannot get memory on device probably because the device is " + "OOM.\n"); + + return TgtPtr; + } + +public: + /// Constructor. If \p Threshold is non-zero, then the default threshold will + /// be overwritten by \p Threshold. + MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0) + : FreeLists(NumBuckets), FreeListLocks(NumBuckets), + DeviceAllocator(DeviceAllocator) { + if (Threshold) + SizeThreshold = Threshold; + } + + /// Destructor + ~MemoryManagerTy() { + for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); + ++Itr) { + assert(Itr->second.Ptr && "nullptr in map table"); + deleteOnDevice(Itr->second.Ptr); + } + } + + /// Allocate memory of size \p Size from target device. \p HstPtr is used to + /// assist the allocation. + void *allocate(size_t Size, void *HstPtr) { + // If the size is zero, we will not bother the target device. Just return + // nullptr directly. + if (Size == 0) + return nullptr; + + DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n", + Size, DPxPTR(HstPtr)); + + // If the size is greater than the threshold, allocate it directly from + // device. + if (Size > SizeThreshold) { + DP("%zu is greater than the threshold %zu. Allocate it directly from " + "device\n", + Size, SizeThreshold); + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + + return TgtPtr; + } + + NodeTy *NodePtr = nullptr; + + // Try to get a node from FreeList + { + const int B = findBucket(Size); + FreeListTy &List = FreeLists[B]; + + NodeTy TempNode(Size, nullptr); + std::lock_guard LG(FreeListLocks[B]); + const auto Itr = List.find(TempNode); + + if (Itr != List.end()) { + NodePtr = &Itr->get(); + List.erase(Itr); + } + } + + if (NodePtr != nullptr) + DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr)); + + // We cannot find a valid node in FreeLists. Let's allocate on device and + // create a node for it. + if (NodePtr == nullptr) { + DP("Cannot find a node in the FreeLists. Allocate on device.\n"); + // Allocate one on device + void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + + if (TgtPtr == nullptr) + return nullptr; + + // Create a new node and add it into the map table + { + std::lock_guard Guard(MapTableLock); + auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr)); + NodePtr = &Itr.first->second; + } + + DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n", + DPxPTR(NodePtr), DPxPTR(TgtPtr), Size); + } + + assert(NodePtr && "NodePtr should not be nullptr at this point"); + + return NodePtr->Ptr; + } + + /// Deallocate memory pointed by \p TgtPtr + int free(void *TgtPtr) { + DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); + + NodeTy *P = nullptr; + + // Look it up into the table + { + std::lock_guard G(MapTableLock); + auto Itr = PtrToNodeTable.find(TgtPtr); + + // We don't remove the node from the map table because the map does not + // change. 
+ if (Itr != PtrToNodeTable.end()) + P = &Itr->second; + } + + // The memory is not managed by the manager + if (P == nullptr) { + DP("Cannot find its node. Delete it on device directly.\n"); + return deleteOnDevice(TgtPtr); + } + + // Insert the node to the free list + const int B = findBucket(P->Size); + + DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B); + + { + std::lock_guard G(FreeListLocks[B]); + FreeLists[B].insert(*P); + } + + return OFFLOAD_SUCCESS; + } + + /// Get the size threshold from the environment variable + /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a + /// std::pair where the first element represents the + /// threshold and the second element represents whether user disables memory + /// manager explicitly by setting the var to 0. If user doesn't specify + /// anything, returns <0, true>. + static std::pair getSizeThresholdFromEnv() { + size_t Threshold = 0; + + if (const char *Env = + std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) { + Threshold = std::stoul(Env); + if (Threshold == 0) { + DP("Disabled memory manager as user set " + "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n"); + return std::make_pair(0, false); + } + } + + return std::make_pair(Threshold, true); + } +}; + +// GCC still cannot handle the static data member like Clang so we still need +// this part. +constexpr const size_t MemoryManagerTy::BucketSize[]; +constexpr const int MemoryManagerTy::NumBuckets; + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H diff --git a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt new file mode 100644 index 000000000..7cad0a0a1 --- /dev/null +++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt @@ -0,0 +1,15 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common ELF functionality for target plugins +# +##===----------------------------------------------------------------------===## + +add_library(elf_common INTERFACE) + +target_include_directories(elf_common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/libomptarget/plugins/common/elf_common.c b/libomptarget/plugins/common/elf_common/elf_common.h similarity index 62% rename from libomptarget/plugins/common/elf_common.c rename to libomptarget/plugins/common/elf_common/elf_common.h index 60e1e4fda..75994238b 100644 --- a/libomptarget/plugins/common/elf_common.c +++ b/libomptarget/plugins/common/elf_common/elf_common.h @@ -1,4 +1,4 @@ -//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// +//===-- elf_common.h - Common ELF functionality -------------------*- C -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,9 +13,9 @@ // //===----------------------------------------------------------------------===// -#if !(defined(_OMPTARGET_H_) && defined(DP)) -#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ - included and macro DP(...) has been defined. +#if !(defined(_OMPTARGET_DEBUG_H)) +#error Include elf_common.h in the plugin source AFTER Debug.h has\ + been included. 
#endif #include @@ -23,7 +23,7 @@ // Check whether an image is valid for execution on target_id static inline int32_t elf_check_machine(__tgt_device_image *image, - uint16_t target_id) { + uint16_t target_id) { // Is the library version incompatible with the header file? if (elf_version(EV_CURRENT) == EV_NONE) { @@ -45,6 +45,7 @@ static inline int32_t elf_check_machine(__tgt_device_image *image, // Check if ELF is the right kind. if (elf_kind(e) != ELF_K_ELF) { DP("Unexpected ELF type!\n"); + elf_end(e); return 0; } Elf64_Ehdr *eh64 = elf64_getehdr(e); @@ -70,3 +71,41 @@ static inline int32_t elf_check_machine(__tgt_device_image *image, elf_end(e); return MachineID == target_id; } + +static inline int32_t elf_is_dynamic(__tgt_device_image *image) { + + char *img_begin = (char *)image->ImageStart; + char *img_end = (char *)image->ImageEnd; + size_t img_size = img_end - img_begin; + + // Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + + if (!eh64 && !eh32) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint16_t Type; + if (eh64 && !eh32) + Type = eh64->e_type; + else if (eh32 && !eh64) + Type = eh32->e_type; + else { + DP("Ambiguous ELF header!\n"); + elf_end(e); + return 0; + } + + elf_end(e); + DP("ELF Type: %d\n", Type); + return Type == ET_DYN; +} diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt index 06c3dd948..7c605a0c3 100644 --- a/libomptarget/plugins/cuda/CMakeLists.txt +++ b/libomptarget/plugins/cuda/CMakeLists.txt @@ -1,16 +1,16 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build a plugin for a CUDA machine if available. # ##===----------------------------------------------------------------------===## -if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.") +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") return() elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") @@ -31,15 +31,18 @@ add_definitions(-DTARGET_NAME=CUDA) include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) -add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) +add_library(bolt-omptarget.rtl.cuda SHARED src/rtl.cpp) # Install plugin under the lib destination folder. 
-install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +install(TARGETS bolt-omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -target_link_libraries(omptarget.rtl.cuda +target_link_libraries(bolt-omptarget.rtl.cuda + elf_common + MemoryManager ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs") # Report to the parent scope that we are building a plugin for CUDA. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) +set(LIBBOLTTARGET_SYSTEM_TARGETS "${LIBBOLTTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index 4a7264401..e4ac1e082 100644 --- a/libomptarget/plugins/cuda/src/rtl.cpp +++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -14,42 +14,53 @@ #include #include #include +#include +#include #include #include +#include "Debug.h" #include "omptargetplugin.h" -#ifndef TARGET_NAME #define TARGET_NAME CUDA -#endif +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) +#include "MemoryManager.h" // Utility for retrieving and printing CUDA error string. -#define CUDA_ERR_STRING(err) \ - do { \ - if (DebugLevel > 0) { \ - const char *errStr; \ - cuGetErrorString(err, &errStr); \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \ - } \ +#ifdef OMPTARGET_DEBUG +#define CUDA_ERR_STRING(err) \ + do { \ + if (getDebugLevel() > 0) { \ + const char *errStr = nullptr; \ + CUresult errStr_status = cuGetErrorString(err, &errStr); \ + if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ + REPORT("Unrecognized CUDA error code: %d\n", err); \ + else if (errStr_status == CUDA_SUCCESS) \ + REPORT("CUDA error is: %s\n", errStr); \ + else { \ + REPORT("Unresolved CUDA error code: %d\n", err); \ + REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ + errStr_status); \ + } \ + } else { \ + const char *errStr = nullptr; \ + CUresult errStr_status = cuGetErrorString(err, &errStr); \ + if (errStr_status == CUDA_SUCCESS) \ + REPORT("%s \n", errStr); \ + } \ } while (false) #else // OMPTARGET_DEBUG -#define DP(...) {} -#define CUDA_ERR_STRING(err) {} +#define CUDA_ERR_STRING(err) \ + do { \ + const char *errStr = nullptr; \ + CUresult errStr_status = cuGetErrorString(err, &errStr); \ + if (errStr_status == CUDA_SUCCESS) \ + REPORT("%s \n", errStr); \ + } while (false) #endif // OMPTARGET_DEBUG -#include "../../common/elf_common.c" +#include "elf_common.h" /// Keep entries table per device. struct FuncOrGblEntryTy { @@ -59,12 +70,12 @@ struct FuncOrGblEntryTy { enum ExecutionModeType { SPMD, // constructors, destructors, - // combined constructs (`teams distribute parallel for [simd]`) + // combined constructs (`teams distribute parallel for [simd]`) GENERIC, // everything else NONE }; -/// Use a single entity to encode a kernel and a set of flags +/// Use a single entity to encode a kernel and a set of flags. 
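These KernelTy records (the struct follows) are what the offload entry table ends up pointing at: when an image is loaded, each host entry's addr is rebound to the per-device kernel record, so a later launch can recover the CUfunction and execution mode from the opaque pointer it receives. A minimal sketch of that association (HostEntry and KernelRecord are illustrative stand-ins):

#include <cstdio>
#include <list>
#include <string>

struct KernelRecord {
  std::string Name;
  int ExecutionMode;            // 0 = SPMD, 1 = GENERIC
};

struct HostEntry {
  void *addr;                   // host address, replaced by &KernelRecord
  const char *name;
};

int main() {
  std::list<KernelRecord> Kernels;          // std::list keeps addresses stable
  char hostStub = 0;                        // stands in for the host symbol
  HostEntry e{&hostStub, "__omp_offloading_foo"};

  // "Load" the image: create the device-side record and rebind the entry.
  Kernels.push_back({e.name, /*GENERIC*/ 1});
  e.addr = &Kernels.back();

  // Later, the launch path only sees the opaque entry address.
  auto *k = static_cast<KernelRecord *>(e.addr);
  std::printf("launching %s in mode %d\n", k->Name.c_str(), k->ExecutionMode);
}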
struct KernelTy { CUfunction Func; @@ -73,718 +84,1161 @@ struct KernelTy { // 1 - Generic mode (with master warp) int8_t ExecutionMode; + /// Maximal number of threads per block for this kernel. + int MaxThreadsPerBlock = 0; + KernelTy(CUfunction _Func, int8_t _ExecutionMode) : Func(_Func), ExecutionMode(_ExecutionMode) {} }; -/// Device envrionment data -/// Manually sync with the deviceRTL side for now, move to a dedicated header file later. +/// Device environment data +/// Manually sync with the deviceRTL side for now, move to a dedicated header +/// file later. struct omptarget_device_environmentTy { int32_t debug_level; }; -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. -std::list KernelsList; +namespace { +bool checkResult(CUresult Err, const char *ErrMsg) { + if (Err == CUDA_SUCCESS) + return true; -/// Class containing all the device information. -class RTLDeviceInfoTy { - std::vector> FuncGblEntries; + REPORT("%s", ErrMsg); + CUDA_ERR_STRING(Err); + return false; +} -public: - int NumberOfDevices; - std::vector Modules; - std::vector Contexts; +int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, + CUstream Stream) { + CUresult Err = + cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); - // Device properties - std::vector ThreadsPerBlock; - std::vector BlocksPerGrid; - std::vector WarpSize; + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from device to device. Pointers: src " + "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +// Structure contains per-device data +struct DeviceDataTy { + /// List that contains all the kernels. + std::list KernelsList; + std::list FuncGblEntries; + + CUcontext Context = nullptr; + // Device properties + int ThreadsPerBlock = 0; + int BlocksPerGrid = 0; + int WarpSize = 0; // OpenMP properties - std::vector NumTeams; - std::vector NumThreads; + int NumTeams = 0; + int NumThreads = 0; +}; + +class StreamManagerTy { + int NumberOfDevices; + // The initial size of stream pool + int EnvNumInitialStreams; + // Per-device stream mutex + std::vector> StreamMtx; + // Per-device stream Id indicates the next available stream in the pool + std::vector NextStreamId; + // Per-device stream pool + std::vector> StreamPool; + // Reference to per-device data + std::vector &DeviceData; + + // If there is no CUstream left in the pool, we will resize the pool to + // allocate more CUstream. This function should be called with device mutex, + // and we do not resize to smaller one. + void resizeStreamPool(const int DeviceId, const size_t NewSize) { + std::vector &Pool = StreamPool[DeviceId]; + const size_t CurrentSize = Pool.size(); + assert(NewSize > CurrentSize && "new size is not larger than current size"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { + // We will return if cannot switch to the right context in case of + // creating bunch of streams that are not corresponding to the right + // device. The offloading will fail later because selected CUstream is + // nullptr. 
+ return; + } + + Pool.resize(NewSize, nullptr); - // OpenMP Environment properties + for (size_t I = CurrentSize; I < NewSize; ++I) { + checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), + "Error returned from cuStreamCreate\n"); + } + } + +public: + StreamManagerTy(const int NumberOfDevices, + std::vector &DeviceData) + : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), + DeviceData(DeviceData) { + StreamPool.resize(NumberOfDevices); + NextStreamId.resize(NumberOfDevices); + StreamMtx.resize(NumberOfDevices); + + if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) + EnvNumInitialStreams = std::stoi(EnvStr); + + // Initialize the next stream id + std::fill(NextStreamId.begin(), NextStreamId.end(), 0); + + // Initialize stream mutex + for (std::unique_ptr &Ptr : StreamMtx) + Ptr = std::make_unique(); + } + + ~StreamManagerTy() { + // Destroy streams + for (int I = 0; I < NumberOfDevices; ++I) { + checkResult(cuCtxSetCurrent(DeviceData[I].Context), + "Error returned from cuCtxSetCurrent\n"); + + for (CUstream &S : StreamPool[I]) { + if (S) + checkResult(cuStreamDestroy(S), + "Error returned from cuStreamDestroy\n"); + } + } + } + + // Get a CUstream from pool. Per-device next stream id always points to the + // next available CUstream. That means, CUstreams [0, id-1] have been + // assigned, and [id,] are still available. If there is no CUstream left, we + // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, + // the id will increase one. + // xxxxxs+++++++++ + // ^ + // id + // After assignment, the pool becomes the following and s is assigned. + // xxxxxs+++++++++ + // ^ + // id + CUstream getStream(const int DeviceId) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + // No CUstream left in the pool, we need to request from CUDA RT + if (Id == StreamPool[DeviceId].size()) { + // By default we double the stream pool every time + resizeStreamPool(DeviceId, Id * 2); + } + return StreamPool[DeviceId][Id++]; + } + + // Return a CUstream back to pool. As mentioned above, per-device next + // stream is always points to the next available CUstream, so when we return + // a CUstream, we need to first decrease the id, and then copy the CUstream + // back. + // It is worth noting that, the order of streams return might be different + // from that they're assigned, that saying, at some point, there might be + // two identical CUstreams. + // xxax+a+++++ + // ^ + // id + // However, it doesn't matter, because they're always on the two sides of + // id. The left one will in the end be overwritten by another CUstream. + // Therefore, after several execution, the order of pool might be different + // from its initial state. 
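The index discipline described above is easier to see on a toy pool. A minimal sketch (plain ints stand in for CUstream handles; resizing, locking and the CUDA calls are omitted):

#include <cassert>
#include <cstdio>
#include <vector>

struct ToyStreamPool {
  std::vector<int> Pool{10, 11, 12, 13};   // stand-ins for stream handles
  size_t NextId = 0;                       // [0, NextId) handed out

  int get() {
    assert(NextId < Pool.size() && "the real pool would resize here");
    return Pool[NextId++];
  }
  void put(int S) {
    assert(NextId > 0 && "nothing to return");
    Pool[--NextId] = S;
  }
};

int main() {
  ToyStreamPool P;
  int a = P.get();                // a = 10, NextId = 1
  int b = P.get();                // b = 11, NextId = 2
  P.put(a);                       // slot 1 holds 10 again, NextId = 1
  P.put(b);                       // slot 0 holds 11: order differs from start
  std::printf("pool front after returns: %d %d\n", P.Pool[0], P.Pool[1]);
}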
+ void returnStream(const int DeviceId, CUstream Stream) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + assert(Id > 0 && "Wrong stream ID"); + StreamPool[DeviceId][--Id] = Stream; + } + + bool initializeDeviceStreamPool(const int DeviceId) { + assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); + + resizeStreamPool(DeviceId, EnvNumInitialStreams); + + // Check the size of stream pool + if (StreamPool[DeviceId].size() != EnvNumInitialStreams) + return false; + + // Check whether each stream is valid + for (CUstream &S : StreamPool[DeviceId]) + if (!S) + return false; + + return true; + } +}; + +class DeviceRTLTy { + int NumberOfDevices; + // OpenMP environment properties int EnvNumTeams; int EnvTeamLimit; - - // OpenMP Requires Flags + // OpenMP requires flags int64_t RequiresFlags; - //static int EnvNumThreads; - static const int HardTeamLimit = 1<<16; // 64k - static const int HardThreadLimit = 1024; - static const int DefaultNumTeams = 128; - static const int DefaultNumThreads = 128; + static constexpr const int HardTeamLimit = 1U << 16U; // 64k + static constexpr const int HardThreadLimit = 1024; + static constexpr const int DefaultNumTeams = 128; + static constexpr const int DefaultNumThreads = 128; - // Record entry point associated with device - void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + std::unique_ptr StreamManager; + std::vector DeviceData; + std::vector Modules; - E.Entries.push_back(entry); - } + /// A class responsible for interacting with device native runtime library to + /// allocate and free memory. + class CUDADeviceAllocatorTy : public DeviceAllocatorTy { + const int DeviceId; + const std::vector &DeviceData; + + public: + CUDADeviceAllocatorTy(int DeviceId, std::vector &DeviceData) + : DeviceId(DeviceId), DeviceData(DeviceData) {} + + void *allocate(size_t Size, void *) override { + if (Size == 0) + return nullptr; + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return nullptr; - // Return true if the entry is associated with device - bool findOffloadEntry(int32_t device_id, void *addr) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + CUdeviceptr DevicePtr; + Err = cuMemAlloc(&DevicePtr, Size); + if (!checkResult(Err, "Error returned from cuMemAlloc\n")) + return nullptr; - for (auto &it : E.Entries) { - if (it.addr == addr) - return true; + return (void *)DevicePtr; } - return false; + int free(void *TgtPtr) override { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + Err = cuMemFree((CUdeviceptr)TgtPtr); + if (!checkResult(Err, "Error returned from cuMemFree\n")) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + } + }; + + /// A vector of device allocators + std::vector DeviceAllocators; + + /// A vector of memory managers. Since the memory manager is non-copyable and + // non-removable, we wrap them into std::unique_ptr. 
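The per-device wiring declared next, one allocator per device plus one unique_ptr-wrapped memory manager built on top of it, looks roughly like this sketch (HostAllocator and Manager are simplified stand-ins for CUDADeviceAllocatorTy and MemoryManagerTy so the example is self-contained):

#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>

struct Allocator {                       // plays the role of DeviceAllocatorTy
  virtual ~Allocator() = default;
  virtual void *allocate(size_t Size) = 0;
  virtual int free(void *Ptr) = 0;
};

struct HostAllocator : Allocator {       // "device" memory is plain malloc here
  void *allocate(size_t Size) override { return std::malloc(Size); }
  int free(void *Ptr) override { std::free(Ptr); return 0; }
};

class Manager {                          // plays the role of MemoryManagerTy
  Allocator &A;
public:
  explicit Manager(Allocator &A) : A(A) {}
  Manager(const Manager &) = delete;     // non-copyable, like the real thing
  void *allocate(size_t Size) { return A.allocate(Size); }
  int free(void *Ptr) { return A.free(Ptr); }
};

int main() {
  const int NumDevices = 2;
  std::vector<HostAllocator> Allocators(NumDevices);
  std::vector<std::unique_ptr<Manager>> Managers;
  for (int I = 0; I < NumDevices; ++I)
    Managers.emplace_back(std::make_unique<Manager>(Allocators[I]));

  void *P = Managers[1]->allocate(64);
  std::printf("device 1 allocation at %p\n", P);
  Managers[1]->free(P);
}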
+ std::vector> MemoryManagers; + + /// Whether use memory manager + bool UseMemoryManager = true; + + // Record entry point associated with device + void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + E.Entries.push_back(entry); } - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + // Return a pointer to the entry associated with the pointer + const __tgt_offload_entry *getOffloadEntry(const int DeviceId, + const void *Addr) const { + for (const __tgt_offload_entry &Itr : + DeviceData[DeviceId].FuncGblEntries.back().Entries) + if (Itr.addr == Addr) + return &Itr; - int32_t size = E.Entries.size(); + return nullptr; + } - // Table is empty - if (!size) - return 0; + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - __tgt_offload_entry *begin = &E.Entries[0]; - __tgt_offload_entry *end = &E.Entries[size - 1]; + if (E.Entries.empty()) + return nullptr; // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = begin; - E.Table.EntriesEnd = ++end; + E.Table.EntriesBegin = E.Entries.data(); + E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); return &E.Table; } // Clear entries table for a device - void clearOffloadEntriesTable(int32_t device_id) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[device_id].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + void clearOffloadEntriesTable(const int DeviceId) { + DeviceData[DeviceId].FuncGblEntries.emplace_back(); + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = 0; + E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; } - RTLDeviceInfoTy() { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG + CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + if (!AsyncInfoPtr->Queue) + AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); + + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +public: + // This class should not be copied + DeviceRTLTy(const DeviceRTLTy &) = delete; + DeviceRTLTy(DeviceRTLTy &&) = delete; + + DeviceRTLTy() + : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), + RequiresFlags(OMP_REQ_UNDEFINED) { DP("Start initializing CUDA\n"); - CUresult err = cuInit(0); - if (err != CUDA_SUCCESS) { - DP("Error when initializing CUDA\n"); - CUDA_ERR_STRING(err); + CUresult Err = cuInit(0); + if (!checkResult(Err, "Error returned from cuInit\n")) { return; } - NumberOfDevices = 0; - - err = cuDeviceGetCount(&NumberOfDevices); - if (err != CUDA_SUCCESS) { - DP("Error when getting CUDA device count\n"); - CUDA_ERR_STRING(err); + Err = cuDeviceGetCount(&NumberOfDevices); + if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) return; - } if (NumberOfDevices == 0) { DP("There are no devices supporting CUDA.\n"); return; } - FuncGblEntries.resize(NumberOfDevices); - Contexts.resize(NumberOfDevices); - 
ThreadsPerBlock.resize(NumberOfDevices); - BlocksPerGrid.resize(NumberOfDevices); - WarpSize.resize(NumberOfDevices); - NumTeams.resize(NumberOfDevices); - NumThreads.resize(NumberOfDevices); + DeviceData.resize(NumberOfDevices); // Get environment variables regarding teams - char *envStr = getenv("OMP_TEAM_LIMIT"); - if (envStr) { + if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { // OMP_TEAM_LIMIT has been set - EnvTeamLimit = std::stoi(envStr); + EnvTeamLimit = std::stoi(EnvStr); DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); - } else { - EnvTeamLimit = -1; } - envStr = getenv("OMP_NUM_TEAMS"); - if (envStr) { + if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { // OMP_NUM_TEAMS has been set - EnvNumTeams = std::stoi(envStr); + EnvNumTeams = std::stoi(EnvStr); DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); - } else { - EnvNumTeams = -1; } - // Default state. - RequiresFlags = OMP_REQ_UNDEFINED; + StreamManager = + std::make_unique(NumberOfDevices, DeviceData); + + for (int I = 0; I < NumberOfDevices; ++I) + DeviceAllocators.emplace_back(I, DeviceData); + + // Get the size threshold from environment variable + std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); + UseMemoryManager = Res.second; + size_t MemoryManagerThreshold = Res.first; + + if (UseMemoryManager) + for (int I = 0; I < NumberOfDevices; ++I) + MemoryManagers.emplace_back(std::make_unique( + DeviceAllocators[I], MemoryManagerThreshold)); } - ~RTLDeviceInfoTy() { - // Close modules - for (auto &module : Modules) - if (module) { - CUresult err = cuModuleUnload(module); - if (err != CUDA_SUCCESS) { - DP("Error when unloading CUDA module\n"); - CUDA_ERR_STRING(err); - } + ~DeviceRTLTy() { + // We first destruct memory managers in case that its dependent data are + // destroyed before it. + for (auto &M : MemoryManagers) + M.release(); + + StreamManager = nullptr; + + for (CUmodule &M : Modules) + // Close module + if (M) + checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); + + for (DeviceDataTy &D : DeviceData) { + // Destroy context + if (D.Context) { + checkResult(cuCtxSetCurrent(D.Context), + "Error returned from cuCtxSetCurrent\n"); + CUdevice Device; + checkResult(cuCtxGetDevice(&Device), + "Error returned from cuCtxGetDevice\n"); + checkResult(cuDevicePrimaryCtxRelease(Device), + "Error returned from cuDevicePrimaryCtxRelease\n"); } + } + } - // Destroy contexts - for (auto &ctx : Contexts) - if (ctx) { - CUresult err = cuCtxDestroy(ctx); - if (err != CUDA_SUCCESS) { - DP("Error when destroying CUDA context\n"); - CUDA_ERR_STRING(err); - } - } + // Check whether a given DeviceId is valid + bool isValidDeviceId(const int DeviceId) const { + return DeviceId >= 0 && DeviceId < NumberOfDevices; } -}; -static RTLDeviceInfoTy DeviceInfo; + int getNumOfDevices() const { return NumberOfDevices; } -#ifdef __cplusplus -extern "C" { -#endif + void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_check_machine(image, 190); // EM_CUDA = 190. 
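__tgt_rtl_is_valid_binary above accepts an image exactly when its ELF machine ID matches the plugin's target (the value 190 is the EM_CUDA constant noted in the comment). A standalone libelf sketch of that check, with only the 64-bit header path shown; the real elf_check_machine also handles 32-bit images:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <libelf.h>

static bool imageMatchesMachine(char *Begin, size_t Size, uint16_t MachineID) {
  if (elf_version(EV_CURRENT) == EV_NONE)   // library/header mismatch
    return false;
  Elf *E = elf_memory(Begin, Size);
  if (!E)
    return false;
  bool Match = false;
  if (elf_kind(E) == ELF_K_ELF)
    if (Elf64_Ehdr *Ehdr = elf64_getehdr(E))
      Match = (Ehdr->e_machine == MachineID);
  elf_end(E);
  return Match;
}

int main() {
  char NotAnElf[] = "plain text";
  // 190 is the EM_CUDA value quoted in the plugin; expect "no" for this buffer.
  std::printf("matches: %s\n",
              imageMatchesMachine(NotAnElf, sizeof(NotAnElf), 190) ? "yes"
                                                                   : "no");
}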
-} + int initDevice(const int DeviceId) { + CUdevice Device; -int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } + DP("Getting device %d\n", DeviceId); + CUresult Err = cuDeviceGet(&Device, DeviceId); + if (!checkResult(Err, "Error returned from cuDeviceGet\n")) + return OFFLOAD_FAIL; -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo.RequiresFlags = RequiresFlags; - return RequiresFlags; -} + // Query the current flags of the primary context and set its flags if + // it is inactive + unsigned int FormerPrimaryCtxFlags = 0; + int FormerPrimaryCtxIsActive = 0; + Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, + &FormerPrimaryCtxIsActive); + if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) + return OFFLOAD_FAIL; -int32_t __tgt_rtl_init_device(int32_t device_id) { + if (FormerPrimaryCtxIsActive) { + DP("The primary context is active, no change to its flags\n"); + if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != + CU_CTX_SCHED_BLOCKING_SYNC) + DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); + } else { + DP("The primary context is inactive, set its flags to " + "CU_CTX_SCHED_BLOCKING_SYNC\n"); + Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); + if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) + return OFFLOAD_FAIL; + } - CUdevice cuDevice; - DP("Getting device %d\n", device_id); - CUresult err = cuDeviceGet(&cuDevice, device_id); - if (err != CUDA_SUCCESS) { - DP("Error when getting CUDA device with id = %d\n", device_id); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } + // Retain the per device primary context and save it to use whenever this + // device is selected. + Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); + if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) + return OFFLOAD_FAIL; + + Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // Initialize stream pool + if (!StreamManager->initializeDeviceStreamPool(DeviceId)) + return OFFLOAD_FAIL; + + // Query attributes to determine number of threads/block and blocks/grid. + int MaxGridDimX; + Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + Device); + if (Err != CUDA_SUCCESS) { + DP("Error getting max grid dimension, use default value %d\n", + DeviceRTLTy::DefaultNumTeams); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; + } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { + DP("Using %d CUDA blocks per grid\n", MaxGridDimX); + DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; + } else { + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + MaxGridDimX, DeviceRTLTy::HardTeamLimit); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; + } - // Create the context and save it to use whenever this device is selected. - err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, - cuDevice); - if (err != CUDA_SUCCESS) { - DP("Error when creating a CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } + // We are only exploiting threads along the x axis. 
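The same query-then-clamp pattern is used for MaxGridDimX above and for MaxBlockDimX just below. A free-standing sketch against the CUDA driver API (the hard limit and the default mirror the constants declared in this patch; error handling is reduced to a fallback):

#include <cstdio>
#include <cuda.h>

static int threadsPerBlockFor(CUdevice Dev) {
  constexpr int HardThreadLimit = 1024;
  constexpr int DefaultNumThreads = 128;
  int MaxBlockDimX = 0;
  if (cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                           Dev) != CUDA_SUCCESS)
    return DefaultNumThreads;              // query failed: fall back
  return MaxBlockDimX <= HardThreadLimit ? MaxBlockDimX : HardThreadLimit;
}

int main() {
  if (cuInit(0) != CUDA_SUCCESS)
    return 1;
  CUdevice Dev;
  if (cuDeviceGet(&Dev, 0) != CUDA_SUCCESS)
    return 1;
  std::printf("threads per block: %d\n", threadsPerBlockFor(Dev));
}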
+ int MaxBlockDimX; + Err = cuDeviceGetAttribute(&MaxBlockDimX, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); + if (Err != CUDA_SUCCESS) { + DP("Error getting max block dimension, use default value %d\n", + DeviceRTLTy::DefaultNumThreads); + DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; + } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) { + DP("Using %d CUDA threads per block\n", MaxBlockDimX); + DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; + } else { + DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " + "capping at the hard limit\n", + MaxBlockDimX, DeviceRTLTy::HardThreadLimit); + DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; + } - // Query attributes to determine number of threads/block and blocks/grid. - int maxGridDimX; - err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - cuDevice); - if (err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default\n"); - DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; - } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX; - DP("Using %d CUDA blocks per grid\n", maxGridDimX); - } else { - DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - maxGridDimX, RTLDeviceInfoTy::HardTeamLimit); - } + // Get and set warp size + int WarpSize; + Err = + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); + if (Err != CUDA_SUCCESS) { + DP("Error getting warp size, assume default value 32\n"); + DeviceData[DeviceId].WarpSize = 32; + } else { + DP("Using warp size %d\n", WarpSize); + DeviceData[DeviceId].WarpSize = WarpSize; + } - // We are only exploiting threads along the x axis. 
- int maxBlockDimX; - err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - cuDevice); - if (err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default\n"); - DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; - } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) { - DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX; - DP("Using %d CUDA threads per block\n", maxBlockDimX); - } else { - DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping" - "at the hard limit\n", - maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit); - } + // Adjust teams to the env variables + if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { + DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", + EnvTeamLimit); + DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; + } - int warpSize; - err = - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice); - if (err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default\n"); - DeviceInfo.WarpSize[device_id] = 32; - } else { - DeviceInfo.WarpSize[device_id] = warpSize; - } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Device supports up to %d CUDA blocks and %d threads with a " + "warp size of %d\n", + DeviceData[DeviceId].BlocksPerGrid, + DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); + + // Set default number of teams + if (EnvNumTeams > 0) { + DP("Default number of teams set according to environment %d\n", + EnvNumTeams); + DeviceData[DeviceId].NumTeams = EnvNumTeams; + } else { + DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; + DP("Default number of teams set according to library's default %d\n", + DeviceRTLTy::DefaultNumTeams); + } - // Adjust teams to the env variables - if (DeviceInfo.EnvTeamLimit > 0 && - DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { - DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - DeviceInfo.EnvTeamLimit); - } + if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceData[DeviceId].BlocksPerGrid); + DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; + } - DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", - DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], - DeviceInfo.WarpSize[device_id]); - - // Set default number of teams - if (DeviceInfo.EnvNumTeams > 0) { - DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo.EnvNumTeams); - } else { - DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Default number of teams set according to library's default %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } - if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { - DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo.BlocksPerGrid[device_id]); - } + // Set default number of threads + DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; + DP("Default number of threads set according to library's default %d\n", + DeviceRTLTy::DefaultNumThreads); + if (DeviceData[DeviceId].NumThreads > + 
DeviceData[DeviceId].ThreadsPerBlock) { + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceData[DeviceId].ThreadsPerBlock); + DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; + } - // Set default number of threads - DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::DefaultNumThreads); - if (DeviceInfo.NumThreads[device_id] > - DeviceInfo.ThreadsPerBlock[device_id]) { - DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo.ThreadsPerBlock[device_id]); + return OFFLOAD_SUCCESS; } - return OFFLOAD_SUCCESS; -} + __tgt_target_table *loadBinary(const int DeviceId, + const __tgt_device_image *Image) { + // Set the context we are using + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return nullptr; + + // Clear the offload table as we are going to create a new one. + clearOffloadEntriesTable(DeviceId); + + // Create the module and extract the function pointers. + CUmodule Module; + DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); + Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); + if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) + return nullptr; + + DP("CUDA module successfully loaded!\n"); + + Modules.push_back(Module); + + // Find the symbols in the module by name. + const __tgt_offload_entry *HostBegin = Image->EntriesBegin; + const __tgt_offload_entry *HostEnd = Image->EntriesEnd; + + std::list &KernelsList = DeviceData[DeviceId].KernelsList; + for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { + if (!E->addr) { + // We return nullptr when something like this happens, the host should + // have always something in the address to uniquely identify the target + // region. + DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); + return nullptr; + } -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting a CUDA context for device %d\n", device_id); - CUDA_ERR_STRING(err); - return NULL; - } + if (E->size) { + __tgt_offload_entry Entry = *E; + CUdeviceptr CUPtr; + size_t CUSize; + Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + REPORT("Loading global '%s' Failed\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + if (CUSize != E->size) { + DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, + CUSize, E->size); + return nullptr; + } - // Clear the offload table as we are going to create a new one. - DeviceInfo.clearOffloadEntriesTable(device_id); + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); + + Entry.addr = (void *)(CUPtr); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. 
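After the queries, the defaults are reconciled with what the device reports: an environment override for the team count wins only up to BlocksPerGrid, and the default thread count is meant to stay within ThreadsPerBlock. A condensed sketch of that reconciliation, under the assumption that the override comes from an OMP_NUM_TEAMS-style variable (the DP messages above only name OMP_TEAM_LIMIT explicitly); the structs are illustrative stand-ins for the DeviceData fields.

#include <algorithm>
#include <cstdlib>

// Illustrative stand-ins for the per-device fields used above.
struct DeviceLimits {
  int BlocksPerGrid;   // max blocks the device (or the hard limit) allows
  int ThreadsPerBlock; // max threads the device (or the hard limit) allows
};
struct DeviceDefaults {
  int NumTeams;
  int NumThreads;
};

static DeviceDefaults pickDefaults(const DeviceLimits &Limits,
                                   int LibDefaultTeams, int LibDefaultThreads) {
  DeviceDefaults D;
  // Honor the environment override if present, else the library default.
  const char *EnvTeams = std::getenv("OMP_NUM_TEAMS");
  D.NumTeams = EnvTeams ? std::atoi(EnvTeams) : LibDefaultTeams;
  // Neither default may exceed what the device can actually run.
  D.NumTeams = std::min(D.NumTeams, Limits.BlocksPerGrid);
  D.NumThreads = std::min(LibDefaultThreads, Limits.ThreadsPerBlock);
  return D;
}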
+ // TODO: when variables types other than to or link are added, + // the below condition should be changed to explicitly + // check for to and link variables types: + // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & + // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) + if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. + cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); + DP("Copy linked variable host address (" DPxMOD + ") to device address (" DPxMOD ")\n", + DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); + } - // Create the module and extract the function pointers. + addOffloadEntry(DeviceId, Entry); - CUmodule cumod; - DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); - err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); - if (err != CUDA_SUCCESS) { - DP("Error when loading CUDA module\n"); - CUDA_ERR_STRING(err); - return NULL; - } + continue; + } - DP("CUDA module successfully loaded!\n"); - DeviceInfo.Modules.push_back(cumod); + CUfunction Func; + Err = cuModuleGetFunction(&Func, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + REPORT("Loading '%s' Failed\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } - // Find the symbols in the module by name. - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + std::string ExecModeNameStr(E->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + CUdeviceptr ExecModePtr; + size_t CUSize; + Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(int8_t)) { + DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", + ExecModeName, CUSize, sizeof(int8_t)); + return nullptr; + } - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from device to host. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } - if (!e->addr) { - // We return NULL when something like this happens, the host should have - // always something in the address to uniquely identify the target region. 
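For kernel entries the loader also resolves an optional companion global named "<entry>_exec_mode" and falls back to GENERIC (1) when the symbol is absent. A trimmed sketch of that lookup, assuming the module has already been loaded with cuModuleLoadDataEx; unlike the plugin above, the sketch treats every failure as "use GENERIC" rather than aborting the load, and the SPMD = 0 / GENERIC = 1 numbering is inferred from the 0..1 range checked above.

#include <cstdint>
#include <string>
#include <cuda.h>

enum ExecutionMode : int8_t { SPMD = 0, GENERIC = 1 };

// Returns the execution mode recorded in the image for KernelName, or
// GENERIC when the <name>_exec_mode symbol is missing or malformed.
static int8_t readExecMode(CUmodule Module, const char *KernelName) {
  int8_t ExecModeVal = GENERIC;
  std::string Name = std::string(KernelName) + "_exec_mode";

  CUdeviceptr Ptr;
  size_t Size;
  if (cuModuleGetGlobal(&Ptr, &Size, Module, Name.c_str()) != CUDA_SUCCESS)
    return ExecModeVal;                  // no symbol: keep the default
  if (Size != sizeof(int8_t))
    return ExecModeVal;                  // unexpected size: keep the default
  if (cuMemcpyDtoH(&ExecModeVal, Ptr, Size) != CUDA_SUCCESS)
    return GENERIC;                      // copy failed: keep the default
  return ExecModeVal;
}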
- DP("Invalid binary: host entry '' (size = %zd)...\n", e->size); + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in cubin file: %d\n", + ExecModeVal); + return nullptr; + } + } else { + REPORT("Loading global exec_mode '%s' - symbol missing, using default " + "value GENERIC (1)\n", + ExecModeName); + CUDA_ERR_STRING(Err); + } + + KernelsList.emplace_back(Func, ExecModeVal); - return NULL; + __tgt_offload_entry Entry = *E; + Entry.addr = &KernelsList.back(); + addOffloadEntry(DeviceId, Entry); } - if (e->size) { - __tgt_offload_entry entry = *e; + // send device environment data to the device + { + omptarget_device_environmentTy DeviceEnv{0}; - CUdeviceptr cuptr; - size_t cusize; - err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); +#ifdef OMPTARGET_DEBUG + if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) + DeviceEnv.debug_level = std::stoi(EnvStr); +#endif - if (err != CUDA_SUCCESS) { - DP("Loading global '%s' (Failed)\n", e->name); - CUDA_ERR_STRING(err); - return NULL; - } + const char *DeviceEnvName = "omptarget_device_environment"; + CUdeviceptr DeviceEnvPtr; + size_t CUSize; + + Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(DeviceEnv)) { + REPORT( + "Global device_environment '%s' - size mismatch (%zu != %zu)\n", + DeviceEnvName, CUSize, sizeof(int32_t)); + CUDA_ERR_STRING(Err); + return nullptr; + } - if (cusize != e->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, - cusize, e->size); - CUDA_ERR_STRING(err); - return NULL; - } + Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from host to device. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", + DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); - entry.addr = (void *)cuptr; - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitely - // check for to and link variables types: - // (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - // (e->flags & OMP_DECLARE_TARGET_LINK || - // e->flags == OMP_DECLARE_TARGET_TO)) - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. 
- cuMemcpyHtoD(cuptr, e->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void**)e->addr)), DPxPTR(cuptr)); + DP("Sending global device environment data %zu bytes\n", CUSize); + } else { + DP("Finding global device environment '%s' - symbol missing.\n", + DeviceEnvName); + DP("Continue, considering this is a device RTL which does not accept " + "environment setting.\n"); } + } + + return getOffloadEntriesTable(DeviceId); + } + + void *dataAlloc(const int DeviceId, const int64_t Size) { + if (UseMemoryManager) + return MemoryManagers[DeviceId]->allocate(Size, nullptr); + + return DeviceAllocators[DeviceId].allocate(Size, nullptr); + } + + int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - DeviceInfo.addOffloadEntry(device_id, entry); + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; - continue; + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from host to device. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; } - CUfunction fun; - err = cuModuleGetFunction(&fun, cumod, e->name); + return OFFLOAD_SUCCESS; + } + + int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; - if (err != CUDA_SUCCESS) { - DP("Loading '%s' (Failed)\n", e->name); - CUDA_ERR_STRING(err); - return NULL; + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from device to host. 
Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; } - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); - - // default value GENERIC (in case symbol is missing from cubin file) - int8_t ExecModeVal = ExecutionModeType::GENERIC; - std::string ExecModeNameStr (e->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t cusize; - err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); - if (err == CUDA_SUCCESS) { - if ((size_t)cusize != sizeof(int8_t)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, cusize, sizeof(int8_t)); - CUDA_ERR_STRING(err); - return NULL; + return OFFLOAD_SUCCESS; + } + + int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); + + // If they are two devices, we try peer to peer copy first + if (SrcDevId != DstDevId) { + int CanAccessPeer = 0; + Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); + if (Err != CUDA_SUCCESS) { + REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } - err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); - CUDA_ERR_STRING(err); - return NULL; + if (!CanAccessPeer) { + DP("P2P memcpy not supported so fall back to D2D memcpy"); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } - if (ExecModeVal < 0 || ExecModeVal > 1) { - DP("Error wrong exec_mode value specified in cubin file: %d\n", - ExecModeVal); - return NULL; + Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); + if (Err != CUDA_SUCCESS) { + REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 + ", dst = %" PRId32 "\n", + SrcDevId, DstDevId); + CUDA_ERR_STRING(Err); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default value " - "GENERIC (1)\n", ExecModeName); - CUDA_ERR_STRING(err); + + Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, + (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, + Size, Stream); + if (Err == CUDA_SUCCESS) + return OFFLOAD_SUCCESS; + + REPORT("Error returned from cuMemcpyPeerAsync. 
src_ptr = " DPxMOD + ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 + "\n", + DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); + CUDA_ERR_STRING(Err); } - KernelsList.push_back(KernelTy(fun, ExecModeVal)); + return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); + } + + int dataDelete(const int DeviceId, void *TgtPtr) { + if (UseMemoryManager) + return MemoryManagers[DeviceId]->free(TgtPtr); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); + return DeviceAllocators[DeviceId].free(TgtPtr); } - // send device environment data to the device - { - omptarget_device_environmentTy device_env; + int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, const int ArgNum, + const int TeamNum, const int ThreadLimit, + const unsigned int LoopTripCount, + __tgt_async_info *AsyncInfo) const { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // All args are references. + std::vector Args(ArgNum); + std::vector Ptrs(ArgNum); + + for (int I = 0; I < ArgNum; ++I) { + Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); + Args[I] = &Ptrs[I]; + } - device_env.debug_level = 0; + KernelTy *KernelInfo = reinterpret_cast(TgtEntryPtr); -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { - device_env.debug_level = std::stoi(envStr); + int CudaThreadsPerBlock; + if (ThreadLimit > 0) { + DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); + CudaThreadsPerBlock = ThreadLimit; + // Add master warp if necessary + if (KernelInfo->ExecutionMode == GENERIC) { + DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); + CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; + } + } else { + DP("Setting CUDA threads per block to default %d\n", + DeviceData[DeviceId].NumThreads); + CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; } -#endif - const char * device_env_Name="omptarget_device_environment"; - CUdeviceptr device_env_Ptr; - size_t cusize; + if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { + DP("Threads per block capped at device limit %d\n", + DeviceData[DeviceId].ThreadsPerBlock); + CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; + } - err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); + if (!KernelInfo->MaxThreadsPerBlock) { + Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + KernelInfo->Func); + if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) + return OFFLOAD_FAIL; + } - if (err == CUDA_SUCCESS) { - if ((size_t)cusize != sizeof(device_env)) { - DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", - device_env_Name, cusize, sizeof(int32_t)); - CUDA_ERR_STRING(err); - return NULL; - } + if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { + DP("Threads per block capped at kernel limit %d\n", + KernelInfo->MaxThreadsPerBlock); + CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; + } - err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. 
Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); - CUDA_ERR_STRING(err); - return NULL; + unsigned int CudaBlocksPerGrid; + if (TeamNum <= 0) { + if (LoopTripCount > 0 && EnvNumTeams < 0) { + if (KernelInfo->ExecutionMode == SPMD) { + // We have a combined construct, i.e. `target teams distribute + // parallel for [simd]`. We launch so many teams so that each thread + // will execute one iteration of the loop. round up to the nearest + // integer + CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; + } else { + // If we reach this point, then we have a non-combined construct, i.e. + // `teams distribute` with a nested `parallel for` and each team is + // assigned one iteration of the `distribute` loop. E.g.: + // + // #pragma omp target teams distribute + // for(...loop_tripcount...) { + // #pragma omp parallel for + // for(...) {} + // } + // + // Threads within a team will execute the iterations of the `parallel` + // loop. + CudaBlocksPerGrid = LoopTripCount; + } + DP("Using %d teams due to loop trip count %" PRIu32 + " and number of threads per block %d\n", + CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); + } else { + DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); + CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; } - - DP("Sending global device environment data %zu bytes\n", (size_t)cusize); + } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { + DP("Capping number of teams to team limit %d\n", + DeviceData[DeviceId].BlocksPerGrid); + CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; } else { - DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); - DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); + DP("Using requested number of teams %d\n", TeamNum); + CudaBlocksPerGrid = TeamNum; } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Launching kernel %s with %d blocks and %d threads in %s " + "mode\n", + (getOffloadEntry(DeviceId, TgtEntryPtr)) + ? getOffloadEntry(DeviceId, TgtEntryPtr)->name + : "(null)", + CudaBlocksPerGrid, CudaThreadsPerBlock, + (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic"); + + CUstream Stream = getStream(DeviceId, AsyncInfo); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, + /* gridDimZ */ 1, CudaThreadsPerBlock, + /* blockDimY */ 1, /* blockDimZ */ 1, + /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); + if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) + return OFFLOAD_FAIL; + + DP("Launch of entry point at " DPxMOD " successful!\n", + DPxPTR(TgtEntryPtr)); + + return OFFLOAD_SUCCESS; } - return DeviceInfo.getOffloadEntriesTable(device_id); -} + int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + CUstream Stream = reinterpret_cast(AsyncInfoPtr->Queue); + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + REPORT("Error when synchronizing stream. stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { - if (size == 0) { - return NULL; - } + // Once the stream is synchronized, return it to stream pool and reset + // async_info. This is to make sure the synchronization only works for its + // own tasks. 
+ StreamManager->returnStream( + DeviceId, reinterpret_cast(AsyncInfoPtr->Queue)); + AsyncInfoPtr->Queue = nullptr; - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error while trying to set CUDA current context\n"); - CUDA_ERR_STRING(err); - return NULL; + return OFFLOAD_SUCCESS; } +}; - CUdeviceptr ptr; - err = cuMemAlloc(&ptr, size); - if (err != CUDA_SUCCESS) { - DP("Error while trying to allocate %d\n", err); - CUDA_ERR_STRING(err); - return NULL; - } +DeviceRTLTy DeviceRTL; +} // namespace + +// Exposed library API function +#ifdef __cplusplus +extern "C" { +#endif - void *vptr = (void *)ptr; - return vptr; +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_check_machine(image, /* EM_CUDA */ 190); } -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } +int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } - err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), - DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %" PRId64 "\n", RequiresFlags); + DeviceRTL.setRequiresFlag(RequiresFlags); + return RequiresFlags; } -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } +int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { + if (DeviceRTL.isValidDeviceId(src_dev_id) && + DeviceRTL.isValidDeviceId(dst_dev_id)) + return 1; - err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); - if (err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), - DPxPTR(tgt_ptr), size); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + return 0; } -int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { - // Set the context we are using. - CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } +int32_t __tgt_rtl_init_device(int32_t device_id) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - err = cuMemFree((CUdeviceptr)tgt_ptr); - if (err != CUDA_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - CUDA_ERR_STRING(err); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + return DeviceRTL.initDevice(device_id); } -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount) { - // Set the context we are using. 
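__tgt_rtl_is_valid_binary above only needs a cheap answer to "was this image built for me?", which elf_check_machine provides by comparing the ELF header's e_machine field against the expected id (190, i.e. EM_CUDA, for this plugin). A hedged sketch of that idea; the real helper lives in the shared elf_common code and validates the image more thoroughly.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <elf.h>

// Stand-in for __tgt_device_image: only the image bounds matter here.
struct DeviceImage {
  void *ImageStart;
  void *ImageEnd;
};

// Returns nonzero if the image looks like a 64-bit ELF built for MachineId.
static int32_t imageMatchesMachine(const DeviceImage *Image,
                                   uint16_t MachineId) {
  size_t Size = (char *)Image->ImageEnd - (char *)Image->ImageStart;
  if (Size < sizeof(Elf64_Ehdr))
    return 0;
  Elf64_Ehdr Header;
  std::memcpy(&Header, Image->ImageStart, sizeof(Header));
  if (std::memcmp(Header.e_ident, ELFMAG, SELFMAG) != 0)
    return 0;                          // not an ELF object at all
  return Header.e_machine == MachineId;
}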
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); - if (err != CUDA_SUCCESS) { - DP("Error when setting CUDA context\n"); - CUDA_ERR_STRING(err); +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.loadBinary(device_id, image); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.dataAlloc(device_id, size); +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } - // All args are references. - std::vector args(arg_num); - std::vector ptrs(arg_num); + return __tgt_rtl_synchronize(device_id, &async_info); +} - for (int32_t i = 0; i < arg_num; ++i) { - ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); - args[i] = &ptrs[i]; - } +int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, + void *hst_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); - KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, + async_info_ptr); +} - int cudaThreadsPerBlock; +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - if (thread_limit > 0) { - cudaThreadsPerBlock = thread_limit; - DP("Setting CUDA threads per block to requested %d\n", thread_limit); - // Add master warp if necessary - if (KernelInfo->ExecutionMode == GENERIC) { - cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; - DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); - } - } else { - cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; - DP("Setting CUDA threads per block to default %d\n", - DeviceInfo.NumThreads[device_id]); - } + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; - if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { - cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; - DP("Threads per block capped at device limit %d\n", - DeviceInfo.ThreadsPerBlock[device_id]); - } + return __tgt_rtl_synchronize(device_id, &async_info); +} - int kernel_limit; - err = cuFuncGetAttribute(&kernel_limit, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); - if (err == CUDA_SUCCESS) { - if (kernel_limit < cudaThreadsPerBlock) { - cudaThreadsPerBlock = kernel_limit; - DP("Threads per block capped at kernel limit %d\n", kernel_limit); - } - } +int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); - int cudaBlocksPerGrid; - if (team_num <= 0) { - if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { - if (KernelInfo->ExecutionMode == 
SPMD) { - // We have a combined construct, i.e. `target teams distribute parallel - // for [simd]`. We launch so many teams so that each thread will - // execute one iteration of the loop. - // round up to the nearest integer - cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; - } else { - // If we reach this point, then we have a non-combined construct, i.e. - // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: - // - // #pragma omp target teams distribute - // for(...loop_tripcount...) { - // #pragma omp parallel for - // for(...) {} - // } - // - // Threads within a team will execute the iterations of the `parallel` - // loop. - cudaBlocksPerGrid = loop_tripcount; - } - DP("Using %d teams due to loop trip count %" PRIu64 " and number of " - "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, - cudaThreadsPerBlock); - } else { - cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; - DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); - } - } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { - cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; - DP("Capping number of teams to team limit %d\n", - DeviceInfo.BlocksPerGrid[device_id]); - } else { - cudaBlocksPerGrid = team_num; - DP("Using requested number of teams %d\n", team_num); - } + return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, + async_info_ptr); +} + +int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, + int dst_dev_id, void *dst_ptr, + int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + + return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, + async_info_ptr); +} - // Run on the device. 
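The team-count heuristic in the removed code above (and in its replacement earlier in this file) is worth a concrete number: in SPMD mode the grid is sized so every thread gets one loop iteration, so a trip count of 10000 at 128 threads per block yields ((10000 - 1) / 128) + 1 = 79 blocks, while Generic mode launches one block per distribute iteration. A one-function sketch of the SPMD case.

// Blocks needed so that Blocks * ThreadsPerBlock covers TripCount
// (ceiling division, matching the expression used above).
constexpr unsigned blocksForTripCount(unsigned TripCount,
                                      unsigned ThreadsPerBlock) {
  return ((TripCount - 1) / ThreadsPerBlock) + 1;
}

// The example from the lead-in: 10000 iterations, 128 threads per block.
static_assert(blocksForTripCount(10000, 128) == 79, "ceil(10000 / 128) == 79");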
- DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, - cudaThreadsPerBlock); +int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, + int32_t dst_dev_id, void *dst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); + assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); - err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, - cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); - if (err != CUDA_SUCCESS) { - DP("Device kernel launch failed!\n"); - CUDA_ERR_STRING(err); + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_exchange_async( + src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); + if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } - DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(tgt_entry_ptr)); + return __tgt_rtl_synchronize(src_dev_id, &async_info); +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.dataDelete(device_id, tgt_ptr); +} - CUresult sync_err = cuCtxSynchronize(); - if (sync_err != CUDA_SUCCESS) { - DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); - CUDA_ERR_STRING(sync_err); +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, &async_info); + if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - } else { - DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - } - return OFFLOAD_SUCCESS; + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.runTargetTeamRegion( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, async_info_ptr); } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { - // use one team and the default number of threads. 
- const int32_t team_num = 1; - const int32_t thread_limit = 0; - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, thread_limit, 0); + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_run_target_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, + /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, + async_info_ptr); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + + return DeviceRTL.synchronize(device_id, async_info_ptr); } #ifdef __cplusplus diff --git a/libomptarget/plugins/exports b/libomptarget/plugins/exports index a14bedf07..62bfc6e24 100644 --- a/libomptarget/plugins/exports +++ b/libomptarget/plugins/exports @@ -1,16 +1,24 @@ VERS1.0 { global: __tgt_rtl_is_valid_binary; + __tgt_rtl_is_data_exchangable; __tgt_rtl_number_of_devices; __tgt_rtl_init_requires; __tgt_rtl_init_device; __tgt_rtl_load_binary; __tgt_rtl_data_alloc; __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_exchange; + __tgt_rtl_data_exchange_async; __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; + __tgt_rtl_synchronize; local: *; }; diff --git a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index 00e58d870..625518f46 100644 --- a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -22,32 +22,19 @@ #include #include +#include "Debug.h" #include "omptargetplugin.h" #ifndef TARGET_NAME #define TARGET_NAME Generic ELF - 64bit #endif +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" #ifndef TARGET_ELF_ID #define TARGET_ELF_ID 0 #endif -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) 
{} -#endif // OMPTARGET_DEBUG - -#include "../../common/elf_common.c" +#include "elf_common.h" #define NUMBER_OF_DEVICES 4 #define OFFLOADSECTIONNAME "omp_offloading_entries" @@ -107,11 +94,6 @@ class RTLDeviceInfoTy { } RTLDeviceInfoTy(int32_t num_devices) { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG FuncGblEntries.resize(num_devices); } @@ -294,8 +276,11 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount /*not used*/) { // ignore team num and thread limit. // Use libffi to launch execution. @@ -328,10 +313,11 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { // use one team and one thread. return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); + tgt_offsets, arg_num, 1, 1, 0); } #ifdef __cplusplus diff --git a/libomptarget/plugins/ve/CMakeLists.txt b/libomptarget/plugins/ve/CMakeLists.txt new file mode 100644 index 000000000..1879b2265 --- /dev/null +++ b/libomptarget/plugins/ve/CMakeLists.txt @@ -0,0 +1,60 @@ +##===----------------------------------------------------------------------===## +# +# Build a plugin for a NEC Aurora machine if available. (Can also run on host) +# +##===----------------------------------------------------------------------===## + + +if(${LIBOMPTARGET_DEP_VEO_FOUND}) + libomptarget_say("Building SX-Aurora VE offloading plugin.") + set(additional_libs "") + set(additional_libs ${LIBOMPTARGET_DEP_VEO_LIBRARIES} + ${LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES} + ${additional_libs}) + + set(tmachine_name "ve") + set(tmachine_libname "ve") + set(tmachine_triple "ve-unknown-linux-unknown") + set(elf_machine_id 251) + + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR}) + + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("bolt-omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp) + + # Install plugin under the lib destination folder. 
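The generic-ELF plugin above drops its private DP definition in favor of the shared Debug.h / DEBUG_PREFIX machinery, but the underlying idea is unchanged: messages are emitted only when LIBOMPTARGET_DEBUG raises the debug level. A self-contained sketch of that env-gated logging pattern (names and the exact output format here are illustrative, not the ones in Debug.h).

#include <cstdio>
#include <cstdlib>

// Illustrative prefix; the plugins compose theirs from TARGET_NAME.
#define SKETCH_PREFIX "TARGET Generic ELF - 64bit RTL"

// Debug level read once from LIBOMPTARGET_DEBUG (0 when unset).
static int debugLevel() {
  static int Level = [] {
    const char *EnvStr = std::getenv("LIBOMPTARGET_DEBUG");
    return EnvStr ? std::atoi(EnvStr) : 0;
  }();
  return Level;
}

#define SKETCH_DP(...)                                                         \
  do {                                                                         \
    if (debugLevel() > 0) {                                                    \
      std::fprintf(stderr, "%s --> ", SKETCH_PREFIX);                          \
      std::fprintf(stderr, __VA_ARGS__);                                       \
    }                                                                          \
  } while (false)

int main() {
  SKETCH_DP("Found %d devices\n", 4); // prints only if LIBOMPTARGET_DEBUG > 0
  return 0;
}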
+ install(TARGETS "bolt-omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) + + # install aliases for BOLT + add_custom_command(TARGET bolt-omptarget.rtl.${tmachine_libname} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget.rtl.${tmachine_libname}${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_SHARED_LIBRARY_PREFIX}omptarget.rtl.${tmachine_libname}${CMAKE_SHARED_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget.rtl.${tmachine_libname}${CMAKE_SHARED_LIBRARY_SUFFIX}\" + \"${CMAKE_SHARED_LIBRARY_PREFIX}omptarget.rtl.${tmachine_libname}${CMAKE_SHARED_LIBRARY_SUFFIX}\" WORKING_DIRECTORY + \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR})") + + target_link_libraries( + "bolt-omptarget.rtl.${tmachine_libname}" + elf_common + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + ${additional_libs} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs") + + # Report to the parent scope that we are building a plugin. + set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) +else() + libomptarget_say("Not building nec-aurora plugin: libveo or libveosinfo not found.") +endif() diff --git a/libomptarget/plugins/ve/src/rtl.cpp b/libomptarget/plugins/ve/src/rtl.cpp new file mode 100644 index 000000000..284ee04c8 --- /dev/null +++ b/libomptarget/plugins/ve/src/rtl.cpp @@ -0,0 +1,450 @@ +//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for NEC Aurora TSUBASA machines +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME VE +#endif + +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#include "elf_common.h" + +struct DynLibTy { + char *FileName; + uint64_t VeoLibHandle; +}; + +/// Keep entries table per device. 
+struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +class RTLDeviceInfoTy { + std::vector> FuncOrGblEntry; + +public: + std::vector ProcHandles; + std::vector Contexts; + std::vector LibraryHandles; + std::list DynLibs; + // Maps OpenMP device Ids to Ve nodeids + std::vector NodeIds; + + void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle, + __tgt_offload_entry *HostBegin, + __tgt_offload_entry *HostEnd) { + FuncOrGblEntry[device_id].emplace_back(); + std::vector<__tgt_offload_entry> &T = + FuncOrGblEntry[device_id].back().Entries; + T.clear(); + for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) { + char *SymbolName = i->name; + // we have not enough access to the target memory to conveniently parse + // the offload table there so we need to lookup every symbol with the host + // table + DP("Looking up symbol: %s\n", SymbolName); + uint64_t SymbolTargetAddr = + veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName); + __tgt_offload_entry Entry; + + if (!SymbolTargetAddr) { + DP("Symbol %s not found in target image\n", SymbolName); + Entry = {NULL, NULL, 0, 0, 0}; + } else { + DP("Found symbol %s successfully in target image (addr: %p)\n", + SymbolName, reinterpret_cast(SymbolTargetAddr)); + Entry = { reinterpret_cast(SymbolTargetAddr), + i->name, + i->size, + i->flags, + 0 }; + } + + T.push_back(Entry); + } + + FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front(); + FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1; + } + + __tgt_target_table *getOffloadTable(int32_t device_id) { + return &FuncOrGblEntry[device_id].back().Table; + } + + RTLDeviceInfoTy() { + + struct ve_nodeinfo node_info; + ve_node_info(&node_info); + + // Build a predictable mapping between VE node ids and OpenMP device ids. + // This is necessary, because nodes can be missing or offline and (active) + // node ids are thus not consecutive. The entries in ve_nodeinfo may also + // not be in the order of their node ids. + for (int i = 0; i < node_info.total_node_count; ++i) { + if (node_info.status[i] == 0) { + NodeIds.push_back(node_info.nodeid[i]); + } + } + + // Because the entries in ve_nodeinfo may not be in the order of their node + // ids, we sort NodeIds to get a predictable mapping. 
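The RTLDeviceInfoTy constructor builds the OpenMP-device-id mapping in two passes: keep only the VE nodes that are online (status 0), then sort the surviving node ids (the sort lands just below) so the numbering stays stable even when nodes are missing or reported out of order. A small sketch of that filtering, with a mock structure in place of ve_nodeinfo.

#include <algorithm>
#include <vector>

// Mock of the ve_nodeinfo fields used by the constructor.
struct NodeInfo {
  int total_node_count;
  std::vector<int> nodeid; // node ids, not necessarily sorted
  std::vector<int> status; // 0 means the node is online
};

// OpenMP device i maps to the i-th smallest online node id.
static std::vector<int> mapDevicesToNodes(const NodeInfo &Info) {
  std::vector<int> NodeIds;
  for (int I = 0; I < Info.total_node_count; ++I)
    if (Info.status[I] == 0)
      NodeIds.push_back(Info.nodeid[I]);
  std::sort(NodeIds.begin(), NodeIds.end());
  return NodeIds;
}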
+ std::sort(NodeIds.begin(), NodeIds.end()); + + int NumDevices = NodeIds.size(); + DP("Found %i VE devices\n", NumDevices); + ProcHandles.resize(NumDevices, NULL); + Contexts.resize(NumDevices, NULL); + FuncOrGblEntry.resize(NumDevices); + LibraryHandles.resize(NumDevices); + } + + ~RTLDeviceInfoTy() { + for (auto &ctx : Contexts) { + if (ctx != NULL) { + if (veo_context_close(ctx) != 0) { + DP("Failed to close VEO context.\n"); + } + } + } + + for (auto &hdl : ProcHandles) { + if (hdl != NULL) { + veo_proc_destroy(hdl); + } + } + + for (auto &lib : DynLibs) { + if (lib.FileName) { + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo; + +static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr, + struct veo_args *args, uint64_t *RetVal) { + DP("Running function with entry point %p\n", + reinterpret_cast(FuncAddr)); + uint64_t RequestHandle = + veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args); + if (RequestHandle == VEO_REQUEST_ID_INVALID) { + DP("Execution of entry point %p failed\n", + reinterpret_cast(FuncAddr)); + return OFFLOAD_FAIL; + } + + DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n", + reinterpret_cast(FuncAddr), RequestHandle); + + int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle, + RetVal); + if (ret != 0) { + DP("Waiting for entry point %p failed (Error code %d)\n", + reinterpret_cast(FuncAddr), ret); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + + +// Return the number of available devices of the type supported by the +// target RTL. +int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); } + +// Return an integer different from zero if the provided device image can be +// supported by the runtime. The functionality is similar to comparing the +// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a +// lightweight query to determine if the RTL is suitable for an image without +// having to load the library, which can be expensive. +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(Image, TARGET_ELF_ID); +#endif +} + +// Initialize the specified device. In case of success return 0; otherwise +// return an error code. +int32_t __tgt_rtl_init_device(int32_t ID) { + DP("Available VEO version: %i\n", veo_api_version()); + + // At the moment we do not really initialize (i.e. create a process or + // context on) the device here, but in "__tgt_rtl_load_binary". + // The reason for this is, that, when we create a process for a statically + // linked binary, the VEO api needs us to already supply the binary (but we + // can load a dynamically linked binary later, after we create the process). + // At this stage, we cannot check if we have a dynamically or statically + // linked binary so we defer process creation until we know. + return OFFLOAD_SUCCESS; +} + +// Pass an executable image section described by image to the specified +// device and prepare an address table of target entities. In case of error, +// return NULL. Otherwise, return a pointer to the built address table. +// Individual entries in the table may also be NULL, when the corresponding +// offload region is not supported on the target device. 
+__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, + __tgt_device_image *Image) { + DP("Dev %d: load binary from " DPxMOD " image\n", ID, + DPxPTR(Image->ImageStart)); + + assert(ID >= 0 && "bad dev id"); + + size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart; + size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + DP("fdopen() for %s failed. Could not write target image\n", tmp_name); + return NULL; + } + + fwrite(Image->ImageStart, ImageSize, 1, ftmp); + + // at least for the static case we need to change the permissions + chmod(tmp_name, 0700); + + DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize); + + fclose(ftmp); + + // See comment in "__tgt_rtl_init_device" + bool is_dyn = true; + if (DeviceInfo.ProcHandles[ID] == NULL) { + struct veo_proc_handle *proc_handle; + is_dyn = elf_is_dynamic(Image); + // If we have a dynamically linked image, we create the process handle, then + // the thread, and then load the image. + // If we have a statically linked image, we need to create the process + // handle and load the image at the same time with veo_proc_create_static(). + if (is_dyn) { + proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); + if (!proc_handle) { + DP("veo_proc_create() failed for device %d\n", ID); + return NULL; + } + } else { + proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name); + if (!proc_handle) { + DP("veo_proc_create_static() failed for device %d, image=%s\n", ID, + tmp_name); + return NULL; + } + } + DeviceInfo.ProcHandles[ID] = proc_handle; + } + + if (DeviceInfo.Contexts[ID] == NULL) { + struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]); + + if (!ctx) { + DP("veo_context_open() failed: %s\n", std::strerror(errno)); + return NULL; + } + + DeviceInfo.Contexts[ID] = ctx; + } + + DP("Aurora device successfully initialized with loaded binary: " + "proc_handle=%p, ctx=%p\n", + DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]); + + uint64_t LibHandle = 0UL; + if (is_dyn) { + LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name); + + if (!LibHandle) { + DP("veo_load_library() failed: LibHandle=%" PRIu64 + " Name=%s. Set env VEORUN_BIN for static linked target code.\n", + LibHandle, tmp_name); + return NULL; + } + + DP("Successfully loaded library dynamically\n"); + } else { + DP("Symbol table is expected to have been created by " + "veo_create_proc_static()\n"); + } + + DynLibTy Lib = {tmp_name, LibHandle}; + DeviceInfo.DynLibs.push_back(Lib); + DeviceInfo.LibraryHandles[ID] = LibHandle; + + DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin, + Image->EntriesEnd); + + return DeviceInfo.getOffloadTable(ID); +} + +// Allocate data on the particular target device, of the specified size. 
+// HostPtr is a address of the host data the allocated target data +// will be associated with (HostPtr may be NULL if it is not known at +// allocation time, like for example it would be for target data that +// is allocated by omp_target_alloc() API). Return address of the +// allocated data on the target that will be used by libomptarget.so to +// initialize the target data mapping structures. These addresses are +// used to generate a table of target variables to pass to +// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in +// case an error occurred on the target device. +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) { + int ret; + uint64_t addr; + + if (DeviceInfo.ProcHandles[ID] == NULL) { + struct veo_proc_handle *proc_handle; + proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); + if (!proc_handle) { + DP("veo_proc_create() failed for device %d\n", ID); + return NULL; + } + DeviceInfo.ProcHandles[ID] = proc_handle; + DP("Aurora device successfully initialized: proc_handle=%p", proc_handle); + } + + ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size); + DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n", + ID, reinterpret_cast(addr), Size); + if (ret != 0) { + DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n", + ID, reinterpret_cast(addr), Size, ret); + return NULL; + } + + return reinterpret_cast(addr); +} + +// Pass the data content to the target device using the target address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size) { + int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr, + HostPtr, (size_t)Size); + if (ret != 0) { + DP("veo_write_mem() failed with error code %d\n", ret); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Retrieve the data content from the target device using its address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size) { + int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr, + (uint64_t)TargetPtr, Size); + if (ret != 0) { + DP("veo_read_mem() failed with error code %d\n", ret); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// De-allocate the data referenced by target ptr on the device. In case of +// success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) { + int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr); + + if (ret != 0) { + DP("veo_free_mem() failed with error code %d\n", ret); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Similar to __tgt_rtl_run_target_region, but additionally specify the +// number of teams to be created and a number of threads in each team. +int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t loop_tripcount) { + int ret; + + // ignore team num and thread limit. 
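Kernel launch on the VE side is a VEO round trip: pack each argument as a 64-bit value, fire the call asynchronously, and block on the returned request id for the result; target_run_function_wait above wraps the last two steps. A condensed sketch of the whole sequence, assuming the VEO C API from ve_offload.h and an already-resolved function address.

#include <cstdint>
#include <vector>
#include <ve_offload.h>

// Runs the device function at FuncAddr with the given 64-bit arguments and
// waits for completion. On success returns true and stores the result in RetVal.
static bool runAndWait(struct veo_thr_ctxt *Ctx, uint64_t FuncAddr,
                       const std::vector<uint64_t> &Args, uint64_t *RetVal) {
  struct veo_args *TargetArgs = veo_args_alloc();
  if (!TargetArgs)
    return false;

  for (size_t I = 0; I < Args.size(); ++I) {
    if (veo_args_set_u64(TargetArgs, static_cast<int>(I), Args[I]) != 0) {
      veo_args_free(TargetArgs);
      return false;
    }
  }

  uint64_t Request = veo_call_async(Ctx, FuncAddr, TargetArgs);
  bool Ok = Request != VEO_REQUEST_ID_INVALID &&
            veo_call_wait_result(Ctx, Request, RetVal) == 0;
  veo_args_free(TargetArgs);
  return Ok;
}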
+ std::vector ptrs(NumArgs); + + struct veo_args *TargetArgs; + TargetArgs = veo_args_alloc(); + + if (TargetArgs == NULL) { + DP("Could not allocate VEO args\n"); + return OFFLOAD_FAIL; + } + + for (int i = 0; i < NumArgs; ++i) { + ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]); + + if (ret != 0) { + DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", + ret, i, Args[i]); + return OFFLOAD_FAIL; + } + } + + uint64_t RetVal; + if (target_run_function_wait(ID, reinterpret_cast(Entry), + TargetArgs, &RetVal) != OFFLOAD_SUCCESS) { + veo_args_free(TargetArgs); + return OFFLOAD_FAIL; + } + veo_args_free(TargetArgs); + return OFFLOAD_SUCCESS; +} + +// Transfer control to the offloaded entry Entry on the target device. +// Args and Offsets are arrays of NumArgs size of target addresses and +// offsets. An offset should be added to the target address before passing it +// to the outlined function on device side. In case of success, return zero. +// Otherwise, return an error code. +int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs) { + return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1, + 1, 0); +} diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt index f30087ed4..984227b33 100644 --- a/libomptarget/src/CMakeLists.txt +++ b/libomptarget/src/CMakeLists.txt @@ -1,9 +1,9 @@ ##===----------------------------------------------------------------------===## -# +# # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# +# ##===----------------------------------------------------------------------===## # # Build offloading library libomptarget.so. @@ -12,20 +12,53 @@ libomptarget_say("Building offloading runtime library libomptarget.") -set(src_files - api.cpp - device.cpp - interface.cpp - rtl.cpp - omptarget.cpp +set(LIBOMPTARGET_SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/rtl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/omptarget.cpp ) -# Build libomptarget library with libdl dependency. -add_library(omptarget SHARED ${src_files}) -target_link_libraries(omptarget - ${CMAKE_DL_LIBS} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") +include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS}) + +# Build libomptarget library with libdl dependency. Add LLVMSupport +# dependency if building in-tree with profiling enabled. +if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING)) + add_library(bolt-omptarget SHARED ${LIBOMPTARGET_SRC_FILES}) + target_link_libraries(bolt-omptarget + ${CMAKE_DL_LIBS} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") +else() + set(LLVM_LINK_COMPONENTS + Support + ) + add_llvm_library(bolt-omptarget SHARED ${LIBOMPTARGET_SRC_FILES} + LINK_LIBS ${CMAKE_DL_LIBS} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports" + ) + target_compile_definitions(bolt-omptarget PUBLIC OMPTARGET_PROFILE_ENABLED) +endif() + +# libomptarget needs to be set separately because add_llvm_library doesn't +# conform with location configuration of its parent scope. +set_target_properties(bolt-omptarget + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) # Install libomptarget under the lib destination folder. 
-install(TARGETS omptarget LIBRARY COMPONENT omptarget +install(TARGETS bolt-omptarget LIBRARY COMPONENT omptarget DESTINATION "${OPENMP_INSTALL_LIBDIR}") + +# Install aliases +get_target_property(BOLT_LIBOMPTARGET_LIBRARY_DIR bolt-omptarget LIBRARY_OUTPUT_DIRECTORY) +if(BOLT_LIBOMPTARGET_LIBRARY_DIR) + add_custom_command(TARGET bolt-omptarget POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_SHARED_LIBRARY_PREFIX}omptarget${CMAKE_SHARED_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${BOLT_LIBOMPTARGET_LIBRARY_DIR} + ) +endif() +install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${CMAKE_SHARED_LIBRARY_PREFIX}bolt-omptarget${CMAKE_SHARED_LIBRARY_SUFFIX}\" + \"${CMAKE_SHARED_LIBRARY_PREFIX}omptarget${CMAKE_SHARED_LIBRARY_SUFFIX}\" WORKING_DIRECTORY + \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR})") diff --git a/libomptarget/src/api.cpp b/libomptarget/src/api.cpp index f93302685..2d3761093 100644 --- a/libomptarget/src/api.cpp +++ b/libomptarget/src/api.cpp @@ -10,8 +10,6 @@ // //===----------------------------------------------------------------------===// -#include - #include "device.h" #include "private.h" #include "rtl.h" @@ -21,21 +19,25 @@ #include EXTERN int omp_get_num_devices(void) { - RTLsMtx.lock(); - size_t Devices_size = Devices.size(); - RTLsMtx.unlock(); + TIMESCOPE(); + PM->RTLsMtx.lock(); + size_t DevicesSize = PM->Devices.size(); + PM->RTLsMtx.unlock(); - DP("Call to omp_get_num_devices returning %zd\n", Devices_size); + DP("Call to omp_get_num_devices returning %zd\n", DevicesSize); - return Devices_size; + return DevicesSize; } EXTERN int omp_get_initial_device(void) { - DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); - return HOST_DEVICE; + TIMESCOPE(); + int hostDevice = omp_get_num_devices(); + DP("Call to omp_get_initial_device returning %d\n", hostDevice); + return hostDevice; } EXTERN void *omp_target_alloc(size_t size, int device_num) { + TIMESCOPE(); DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", device_num, size); @@ -57,13 +59,13 @@ EXTERN void *omp_target_alloc(size_t size, int device_num) { return NULL; } - DeviceTy &Device = Devices[device_num]; - rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); + rc = PM->Devices[device_num].allocData(size); DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); return rc; } EXTERN void omp_target_free(void *device_ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_free for device %d and address " DPxMOD "\n", device_num, DPxPTR(device_ptr)); @@ -83,12 +85,12 @@ EXTERN void omp_target_free(void *device_ptr, int device_num) { return; } - DeviceTy &Device = Devices[device_num]; - Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); + PM->Devices[device_num].deleteData(device_ptr); DP("omp_target_free deallocated device ptr\n"); } EXTERN int omp_target_is_present(void *ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", device_num, DPxPTR(ptr)); @@ -102,16 +104,16 @@ EXTERN int omp_target_is_present(void *ptr, int device_num) { return true; } - RTLsMtx.lock(); - size_t Devices_size = Devices.size(); - RTLsMtx.unlock(); - if (Devices_size <= (size_t)device_num) { + PM->RTLsMtx.lock(); + size_t DevicesSize = PM->Devices.size(); + PM->RTLsMtx.unlock(); + if (DevicesSize <= (size_t)device_num) { DP("Call to omp_target_is_present with invalid device ID, returning 
" "false\n"); return false; } - DeviceTy& Device = Devices[device_num]; + DeviceTy &Device = PM->Devices[device_num]; bool IsLast; // not used bool IsHostPtr; void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); @@ -120,7 +122,7 @@ EXTERN int omp_target_is_present(void *ptr, int device_num) { // getTgtPtrBegin() function which means that there is no device // corresponding point for ptr. This function should return false // in that situation. - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) rc = !IsHostPtr; DP("Call to omp_target_is_present returns %d\n", rc); return rc; @@ -128,24 +130,30 @@ EXTERN int omp_target_is_present(void *ptr, int device_num) { EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, size_t src_offset, int dst_device, int src_device) { + TIMESCOPE(); DP("Call to omp_target_memcpy, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), DPxPTR(src), dst_offset, src_offset, length); if (!dst || !src || length <= 0) { - DP("Call to omp_target_memcpy with invalid arguments\n"); + if (length == 0) { + DP("Call to omp_target_memcpy with zero length, nothing to do\n"); + return OFFLOAD_SUCCESS; + } + + REPORT("Call to omp_target_memcpy with invalid arguments\n"); return OFFLOAD_FAIL; } if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; + REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; } if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; + REPORT("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; } int rc = OFFLOAD_SUCCESS; @@ -160,20 +168,29 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, rc = OFFLOAD_FAIL; } else if (src_device == omp_get_initial_device()) { DP("copy from host to device\n"); - DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length); + DeviceTy &DstDev = PM->Devices[dst_device]; + rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr); } else if (dst_device == omp_get_initial_device()) { DP("copy from device to host\n"); - DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length); + DeviceTy &SrcDev = PM->Devices[src_device]; + rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr); } else { DP("copy from device to device\n"); + DeviceTy &SrcDev = PM->Devices[src_device]; + DeviceTy &DstDev = PM->Devices[dst_device]; + // First try to use D2D memcpy which is more efficient. If fails, fall back + // to unefficient way. 
+ if (SrcDev.isDataExchangable(DstDev)) { + rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + return OFFLOAD_SUCCESS; + } + void *buffer = malloc(length); - DeviceTy& SrcDev = Devices[src_device]; - DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length); + rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr); if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length); + rc = DstDev.submitData(dstAddr, buffer, length, nullptr); + free(buffer); } DP("omp_target_memcpy returns %d\n", rc); @@ -184,6 +201,7 @@ EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, int num_dims, const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device, int src_device) { + TIMESCOPE(); DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " @@ -200,7 +218,7 @@ EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { - DP("Call to omp_target_memcpy_rect with invalid arguments\n"); + REPORT("Call to omp_target_memcpy_rect with invalid arguments\n"); return OFFLOAD_FAIL; } @@ -238,26 +256,27 @@ EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, size_t device_offset, int device_num) { + TIMESCOPE(); DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", " "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n", DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num); if (!host_ptr || !device_ptr || size <= 0) { - DP("Call to omp_target_associate_ptr with invalid arguments\n"); + REPORT("Call to omp_target_associate_ptr with invalid arguments\n"); return OFFLOAD_FAIL; } if (device_num == omp_get_initial_device()) { - DP("omp_target_associate_ptr: no association possible on the host\n"); + REPORT("omp_target_associate_ptr: no association possible on the host\n"); return OFFLOAD_FAIL; } if (!device_is_ready(device_num)) { - DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n"); + REPORT("omp_target_associate_ptr returns OFFLOAD_FAIL\n"); return OFFLOAD_FAIL; } - DeviceTy& Device = Devices[device_num]; + DeviceTy &Device = PM->Devices[device_num]; void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset); int rc = Device.associatePtr(host_ptr, device_addr, size); DP("omp_target_associate_ptr returns %d\n", rc); @@ -265,25 +284,27 @@ EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr, } EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", " "device_num %d\n", DPxPTR(host_ptr), device_num); if (!host_ptr) { - DP("Call to omp_target_associate_ptr with invalid host_ptr\n"); + REPORT("Call to omp_target_associate_ptr with invalid host_ptr\n"); return OFFLOAD_FAIL; } if (device_num == omp_get_initial_device()) { - DP("omp_target_disassociate_ptr: no association possible on the host\n"); + REPORT( + "omp_target_disassociate_ptr: no association possible on the host\n"); return OFFLOAD_FAIL; } if 
(!device_is_ready(device_num)) { - DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n"); + REPORT("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n"); return OFFLOAD_FAIL; } - DeviceTy& Device = Devices[device_num]; + DeviceTy &Device = PM->Devices[device_num]; int rc = Device.disassociatePtr(host_ptr); DP("omp_target_disassociate_ptr returns %d\n", rc); return rc; diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp index cf7e0fe0c..ee90bc647 100644 --- a/libomptarget/src/device.cpp +++ b/libomptarget/src/device.cpp @@ -16,50 +16,77 @@ #include #include +#include #include -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. -DevicesTy Devices; +DeviceTy::DeviceTy(const DeviceTy &D) + : DeviceID(D.DeviceID), RTL(D.RTL), RTLDeviceID(D.RTLDeviceID), + IsInit(D.IsInit), InitFlag(), HasPendingGlobals(D.HasPendingGlobals), + HostDataToTargetMap(D.HostDataToTargetMap), + PendingCtorsDtors(D.PendingCtorsDtors), ShadowPtrMap(D.ShadowPtrMap), + DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + LoopTripCnt(D.LoopTripCnt) {} + +DeviceTy &DeviceTy::operator=(const DeviceTy &D) { + DeviceID = D.DeviceID; + RTL = D.RTL; + RTLDeviceID = D.RTLDeviceID; + IsInit = D.IsInit; + HasPendingGlobals = D.HasPendingGlobals; + HostDataToTargetMap = D.HostDataToTargetMap; + PendingCtorsDtors = D.PendingCtorsDtors; + ShadowPtrMap = D.ShadowPtrMap; + LoopTripCnt = D.LoopTripCnt; + + return *this; +} + +DeviceTy::DeviceTy(RTLInfoTy *RTL) + : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), + HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + +DeviceTy::~DeviceTy() { + if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) + return; + + ident_t loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"}; + dumpTargetPointerMappings(&loc, *this); +} int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { DataMapMtx.lock(); // Check if entry exists - for (auto &HT : HostDataToTargetMap) { - if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) { - // Mapping already exists - bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin && - HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size && - HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin; - DataMapMtx.unlock(); - if (isValid) { - DP("Attempt to re-associate the same device ptr+offset with the same " - "host ptr, nothing to do\n"); - return OFFLOAD_SUCCESS; - } else { - DP("Not allowed to re-associate a different device ptr+offset with the " - "same host ptr\n"); - return OFFLOAD_FAIL; - } + auto search = HostDataToTargetMap.find(HstPtrBeginTy{(uintptr_t)HstPtrBegin}); + if (search != HostDataToTargetMap.end()) { + // Mapping already exists + bool isValid = search->HstPtrEnd == (uintptr_t)HstPtrBegin + Size && + search->TgtPtrBegin == (uintptr_t)TgtPtrBegin; + DataMapMtx.unlock(); + if (isValid) { + DP("Attempt to re-associate the same device ptr+offset with the same " + "host ptr, nothing to do\n"); + return OFFLOAD_SUCCESS; + } else { + REPORT("Not allowed to re-associate a different device ptr+offset with " + "the same host ptr\n"); + return OFFLOAD_FAIL; } } - // Mapping does not exist, allocate it - HostDataToTargetTy newEntry; - - // Set up missing fields - newEntry.HstPtrBase = (uintptr_t) HstPtrBegin; - newEntry.HstPtrBegin = (uintptr_t) HstPtrBegin; - newEntry.HstPtrEnd = (uintptr_t) HstPtrBegin + Size; - newEntry.TgtPtrBegin = (uintptr_t) TgtPtrBegin; - // refCount must be infinite - newEntry.RefCount = 
INF_REF_CNT; + // Mapping does not exist, allocate it with refCount=INF + HostDataToTargetTy newEntry((uintptr_t)HstPtrBegin /*HstPtrBase*/, + (uintptr_t)HstPtrBegin /*HstPtrBegin*/, + (uintptr_t)HstPtrBegin + Size /*HstPtrEnd*/, + (uintptr_t)TgtPtrBegin /*TgtPtrBegin*/, nullptr, + true /*IsRefCountINF*/); DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase), DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd), DPxPTR(newEntry.TgtPtrBegin)); - HostDataToTargetMap.push_front(newEntry); + HostDataToTargetMap.insert(newEntry); DataMapMtx.unlock(); @@ -69,46 +96,45 @@ int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { int DeviceTy::disassociatePtr(void *HstPtrBegin) { DataMapMtx.lock(); - // Check if entry exists - for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin(); - ii != HostDataToTargetMap.end(); ++ii) { - if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) { - // Mapping exists - if (CONSIDERED_INF(ii->RefCount)) { - DP("Association found, removing it\n"); - HostDataToTargetMap.erase(ii); - DataMapMtx.unlock(); - return OFFLOAD_SUCCESS; - } else { - DP("Trying to disassociate a pointer which was not mapped via " - "omp_target_associate_ptr\n"); - break; - } + auto search = HostDataToTargetMap.find(HstPtrBeginTy{(uintptr_t)HstPtrBegin}); + if (search != HostDataToTargetMap.end()) { + // Mapping exists + if (search->isRefCountInf()) { + DP("Association found, removing it\n"); + HostDataToTargetMap.erase(search); + DataMapMtx.unlock(); + return OFFLOAD_SUCCESS; + } else { + REPORT("Trying to disassociate a pointer which was not mapped via " + "omp_target_associate_ptr\n"); } } // Mapping not found DataMapMtx.unlock(); - DP("Association not found\n"); + REPORT("Association not found\n"); return OFFLOAD_FAIL; } // Get ref count of map entry containing HstPtrBegin -long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { +uint64_t DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { uintptr_t hp = (uintptr_t)HstPtrBegin; - long RefCnt = -1; + uint64_t RefCnt = 0; DataMapMtx.lock(); - for (auto &HT : HostDataToTargetMap) { - if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { - DP("DeviceTy::getMapEntry: requested entry found\n"); - RefCnt = HT.RefCount; - break; + if (!HostDataToTargetMap.empty()) { + auto upper = HostDataToTargetMap.upper_bound(hp); + if (upper != HostDataToTargetMap.begin()) { + upper--; + if (hp >= upper->HstPtrBegin && hp < upper->HstPtrEnd) { + DP("DeviceTy::getMapEntry: requested entry found\n"); + RefCnt = upper->getRefCount(); + } } } DataMapMtx.unlock(); - if (RefCnt < 0) { + if (RefCnt == 0) { DP("DeviceTy::getMapEntry: requested entry not found\n"); } @@ -119,23 +145,33 @@ LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { uintptr_t hp = (uintptr_t)HstPtrBegin; LookupResult lr; - DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), - Size); - for (lr.Entry = HostDataToTargetMap.begin(); - lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { + DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%" PRId64 ")...\n", + DPxPTR(hp), Size); + + if (HostDataToTargetMap.empty()) + return lr; + + auto upper = HostDataToTargetMap.upper_bound(hp); + // check the left bin + if (upper != HostDataToTargetMap.begin()) { + lr.Entry = std::prev(upper); auto &HT = *lr.Entry; // Is it contained? 
lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && (hp+Size) <= HT.HstPtrEnd; + // Does it extend beyond the mapped region? + lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp + Size) > HT.HstPtrEnd; + } + + // check the right bin + if (!(lr.Flags.IsContained || lr.Flags.ExtendsAfter) && + upper != HostDataToTargetMap.end()) { + lr.Entry = upper; + auto &HT = *lr.Entry; // Does it extend into an already mapped region? lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; // Does it extend beyond the mapped region? lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; - - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || - lr.Flags.ExtendsAfter) { - break; - } } if (lr.Flags.ExtendsBefore) { @@ -150,17 +186,20 @@ LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { return lr; } -// Used by target_data_begin +// Used by targetDataBegin // Return the target pointer begin (where the data will be moved). // Allocate memory if this is the first occurrence of this mapping. // Increment the reference counter. // If NULL is returned, then either data allocation failed or the user tried // to do an illegal mapping. void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, - int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, - bool UpdateRefCount, bool HasCloseModifier) { + int64_t Size, map_var_info_t HstPtrName, + bool &IsNew, bool &IsHostPtr, bool IsImplicit, + bool UpdateRefCount, bool HasCloseModifier, + bool HasPresentModifier) { void *rc = NULL; IsHostPtr = false; + IsNew = false; DataMapMtx.lock(); LookupResult lr = lookupMapping(HstPtrBegin, Size); @@ -174,78 +213,100 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, IsNew = false; if (UpdateRefCount) - ++HT.RefCount; + HT.incRefCount(); uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), - DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - (CONSIDERED_INF(HT.RefCount)) ? "INF" : - std::to_string(HT.RefCount).c_str()); + INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID, + "Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", " + "Size=%" PRId64 ",%s RefCount=%s, Name=%s\n", + (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(tp), + Size, (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str(), + (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown"); rc = (void *)tp; } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { // Explicit extension of mapped data - not allowed. - DP("Explicit extension of mapping is not allowed.\n"); - } else if (Size) { - // If unified shared memory is active, implicitly mapped variables that are not - // privatized use host address. Any explicitly mapped variables also use - // host address where correctness is not impeded. In all other cases - // maps are respected. - // In addition to the mapping rules above, the close map - // modifier forces the mapping of the variable to the device. - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) { - DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", - DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? 
" updated" : "")); + MESSAGE("explicit extension not allowed: host address specified is " DPxMOD + " (%" PRId64 " bytes), but device allocation maps to host at " + DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), Size, DPxPTR(lr.Entry->HstPtrBegin), + lr.Entry->HstPtrEnd - lr.Entry->HstPtrBegin); + if (HasPresentModifier) + MESSAGE("device mapping required by 'present' map type modifier does not " + "exist for host address " DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), Size); + } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) { + // If unified shared memory is active, implicitly mapped variables that are + // not privatized use host address. Any explicitly mapped variables also use + // host address where correctness is not impeded. In all other cases maps + // are respected. + // In addition to the mapping rules above, the close map modifier forces the + // mapping of the variable to the device. + if (Size) { + DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " RefCount=%s\n", + DPxPTR((uintptr_t)HstPtrBegin), Size, + (UpdateRefCount ? " updated" : "")); IsHostPtr = true; rc = HstPtrBegin; - } else { - // If it is not contained and Size > 0 we should create a new entry for it. - IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " - "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), - DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); - HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, - (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); - rc = (void *)tp; } + } else if (HasPresentModifier) { + DP("Mapping required by 'present' map type modifier does not exist for " + "HstPtrBegin=" DPxMOD ", Size=%" PRId64 "\n", + DPxPTR(HstPtrBegin), Size); + MESSAGE("device mapping required by 'present' map type modifier does not " + "exist for host address " DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), Size); + } else if (Size) { + // If it is not contained and Size > 0, we should create a new entry for it. + IsNew = true; + uintptr_t tp = (uintptr_t)allocData(Size, HstPtrBegin); + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", + DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), + DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + HostDataToTargetMap.emplace( + HostDataToTargetTy((uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, + (uintptr_t)HstPtrBegin + Size, tp, HstPtrName)); + rc = (void *)tp; } DataMapMtx.unlock(); return rc; } -// Used by target_data_begin, target_data_end, target_data_update and target. +// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target. // Return the target pointer begin (where the data will be moved). -// Decrement the reference counter if called from target_data_end. +// Decrement the reference counter if called from targetDataEnd. 
void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr) { + bool UpdateRefCount, bool &IsHostPtr, + bool MustContain) { void *rc = NULL; IsHostPtr = false; IsLast = false; DataMapMtx.lock(); LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + if (lr.Flags.IsContained || + (!MustContain && (lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter))) { auto &HT = *lr.Entry; - IsLast = !(HT.RefCount > 1); + IsLast = HT.getRefCount() == 1; - if (HT.RefCount > 1 && UpdateRefCount) - --HT.RefCount; + if (!IsLast && UpdateRefCount) + HT.decRefCount(); uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - (CONSIDERED_INF(HT.RefCount)) ? "INF" : - std::to_string(HT.RefCount).c_str()); + "Size=%" PRId64 ",%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), + Size, (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); rc = (void *)tp; - } else if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { // If the value isn't found in the mapping and unified shared memory // is on then it means we have stumbled upon a value which we need to // use directly from the host. - DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", + DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " RefCount=%s\n", DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); IsHostPtr = true; rc = HstPtrBegin; @@ -271,7 +332,8 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, bool HasCloseModifier) { - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) + if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) return OFFLOAD_SUCCESS; // Check if the pointer is contained in any sub-nodes. int rc; @@ -280,21 +342,21 @@ int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { auto &HT = *lr.Entry; if (ForceDelete) - HT.RefCount = 1; - if (--HT.RefCount <= 0) { - assert(HT.RefCount == 0 && "did not expect a negative ref count"); - DP("Deleting tgt data " DPxMOD " of size %ld\n", + HT.resetRefCount(); + if (HT.decRefCount() == 0) { + DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); - RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); + deleteData((void *)HT.TgtPtrBegin); DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD - ", Size=%ld\n", (ForceDelete ? " (forced)" : ""), + ", Size=%" PRId64 "\n", (ForceDelete ? 
" (forced)" : ""), DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); HostDataToTargetMap.erase(lr.Entry); } rc = OFFLOAD_SUCCESS; } else { - DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" - " memory\n", DPxPTR(HstPtrBegin)); + REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the" + " allocated memory\n", + DPxPTR(HstPtrBegin)); rc = OFFLOAD_FAIL; } @@ -306,11 +368,12 @@ int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, void DeviceTy::init() { // Make call to init_requires if it exists for this plugin. if (RTL->init_requires) - RTL->init_requires(RTLs.RequiresFlags); - int32_t rc = RTL->init_device(RTLDeviceID); - if (rc == OFFLOAD_SUCCESS) { - IsInit = true; - } + RTL->init_requires(PM->RTLs.RequiresFlags); + int32_t Ret = RTL->init_device(RTLDeviceID); + if (Ret != OFFLOAD_SUCCESS) + return; + + IsInit = true; } /// Thread-safe method to initialize the device only once. @@ -337,31 +400,90 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) { return rc; } -// Submit data to device. -int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size) { - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); +void *DeviceTy::allocData(int64_t Size, void *HstPtr) { + return RTL->data_alloc(RTLDeviceID, Size, HstPtr); +} + +int32_t DeviceTy::deleteData(void *TgtPtrBegin) { + return RTL->data_delete(RTLDeviceID, TgtPtrBegin); +} + +// Submit data to device +int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + else + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); +} + +// Retrieve data from device +int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + else + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); } -// Retrieve data from device. 
-int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size) { - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); +// Copy data from current device to destination device directly +int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) { + assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); + return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, + Size); + } else + return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, + DstPtr, Size, AsyncInfo); } // Run region on device -int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) { - return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); +int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize); + else + return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, AsyncInfoPtr); } // Run team region on device. -int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount) { - return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); +int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, + LoopTripCount); + else + return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, + ThreadLimit, LoopTripCount, AsyncInfoPtr); +} + +// Whether data can be copied to DstDevice directly +bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) { + if (RTL != DstDevice.RTL || !RTL->is_data_exchangable) + return false; + + if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID)) + return (RTL->data_exchange != nullptr) || + (RTL->data_exchange_async != nullptr); + + return false; +} + +int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) { + if (RTL->synchronize) + return RTL->synchronize(RTLDeviceID, AsyncInfoPtr); + return OFFLOAD_SUCCESS; } /// Check whether a device has an associated RTL and initialize it if it's not @@ -370,16 +492,16 @@ bool device_is_ready(int device_num) { DP("Checking whether device %d is ready.\n", device_num); // Devices.size() can only change while registering a new // library, so try to acquire the lock of RTLs' mutex. 
- RTLsMtx.lock(); - size_t Devices_size = Devices.size(); - RTLsMtx.unlock(); - if (Devices_size <= (size_t)device_num) { + PM->RTLsMtx.lock(); + size_t DevicesSize = PM->Devices.size(); + PM->RTLsMtx.unlock(); + if (DevicesSize <= (size_t)device_num) { DP("Device ID %d does not have a matching RTL\n", device_num); return false; } // Get device info - DeviceTy &Device = Devices[device_num]; + DeviceTy &Device = PM->Devices[device_num]; DP("Is the device %d (local ID %d) initialized? %d\n", device_num, Device.RTLDeviceID, Device.IsInit); diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h index d33512bb0..e79c3bad4 100644 --- a/libomptarget/src/device.h +++ b/libomptarget/src/device.h @@ -13,44 +13,100 @@ #ifndef _OMPTARGET_DEVICE_H #define _OMPTARGET_DEVICE_H +#include #include -#include #include #include +#include #include +#include #include +#include "rtl.h" + // Forward declarations. struct RTLInfoTy; struct __tgt_bin_desc; struct __tgt_target_table; +struct __tgt_async_info; + +using map_var_info_t = void *; -#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions -#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1)) +// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition +enum kmp_target_offload_kind { + tgt_disabled = 0, + tgt_default = 1, + tgt_mandatory = 2 +}; +typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; /// Map between host data and target data. struct HostDataToTargetTy { uintptr_t HstPtrBase; // host info. uintptr_t HstPtrBegin; uintptr_t HstPtrEnd; // non-inclusive. + map_var_info_t HstPtrName; // Optional source name of mapped variable. uintptr_t TgtPtrBegin; // target info. - long RefCount; +private: + /// use mutable to allow modification via std::set iterator which is const. + mutable uint64_t RefCount; + static const uint64_t INFRefCount = ~(uint64_t)0; - HostDataToTargetTy() - : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0), - TgtPtrBegin(0), RefCount(0) {} - HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB) - : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), - TgtPtrBegin(TB), RefCount(1) {} +public: HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, - long RF) - : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), - TgtPtrBegin(TB), RefCount(RF) {} + map_var_info_t Name = nullptr, bool IsINF = false) + : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name), + TgtPtrBegin(TB), RefCount(IsINF ? 
INFRefCount : 1) {} + + uint64_t getRefCount() const { + return RefCount; + } + + uint64_t resetRefCount() const { + if (RefCount != INFRefCount) + RefCount = 1; + + return RefCount; + } + + uint64_t incRefCount() const { + if (RefCount != INFRefCount) { + ++RefCount; + assert(RefCount < INFRefCount && "refcount overflow"); + } + + return RefCount; + } + + uint64_t decRefCount() const { + if (RefCount != INFRefCount) { + assert(RefCount > 0 && "refcount underflow"); + --RefCount; + } + + return RefCount; + } + + bool isRefCountInf() const { + return RefCount == INFRefCount; + } }; -typedef std::list HostDataToTargetListTy; +typedef uintptr_t HstPtrBeginTy; +inline bool operator<(const HostDataToTargetTy &lhs, const HstPtrBeginTy &rhs) { + return lhs.HstPtrBegin < rhs; +} +inline bool operator<(const HstPtrBeginTy &lhs, const HostDataToTargetTy &rhs) { + return lhs < rhs.HstPtrBegin; +} +inline bool operator<(const HostDataToTargetTy &lhs, + const HostDataToTargetTy &rhs) { + return lhs.HstPtrBegin < rhs.HstPtrBegin; +} + +typedef std::set> HostDataToTargetListTy; struct LookupResult { struct { @@ -100,43 +156,29 @@ struct DeviceTy { // moved into the target task in libomp. std::map LoopTripCnt; - DeviceTy(RTLInfoTy *RTL) - : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + DeviceTy(RTLInfoTy *RTL); // The existence of mutexes makes DeviceTy non-copyable. We need to // provide a copy constructor and an assignment operator explicitly. - DeviceTy(const DeviceTy &d) - : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), - IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), - HostDataToTargetMap(d.HostDataToTargetMap), - PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), - DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - LoopTripCnt(d.LoopTripCnt) {} - - DeviceTy& operator=(const DeviceTy &d) { - DeviceID = d.DeviceID; - RTL = d.RTL; - RTLDeviceID = d.RTLDeviceID; - IsInit = d.IsInit; - HasPendingGlobals = d.HasPendingGlobals; - HostDataToTargetMap = d.HostDataToTargetMap; - PendingCtorsDtors = d.PendingCtorsDtors; - ShadowPtrMap = d.ShadowPtrMap; - LoopTripCnt = d.LoopTripCnt; - - return *this; - } + DeviceTy(const DeviceTy &D); - long getMapEntryRefCnt(void *HstPtrBegin); + DeviceTy &operator=(const DeviceTy &D); + + ~DeviceTy(); + + // Return true if data can be copied to DstDevice directly + bool isDataExchangable(const DeviceTy& DstDevice); + + uint64_t getMapEntryRefCnt(void *HstPtrBegin); LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, - bool HasCloseModifier = false); + map_var_info_t HstPtrName, bool &IsNew, + bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount, + bool HasCloseModifier, bool HasPresentModifier); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr); + bool UpdateRefCount, bool &IsHostPtr, + bool MustContain = false); int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, bool HasCloseModifier = false); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); @@ -146,14 +188,40 @@ struct DeviceTy { int32_t initOnce(); __tgt_target_table *load_binary(void *Img); - 
int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); - - int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize); - int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripCount); + // device memory allocation/deallocation routines + /// Allocates \p Size bytes on the device and returns the address/nullptr when + /// succeeds/fails. \p HstPtr is an address of the host data which the + /// allocated target data will be associated with. If it is unknown, the + /// default value of \p HstPtr is nullptr. Note: this function doesn't do + /// pointer association. Actually, all the __tgt_rtl_data_alloc + /// implementations ignore \p HstPtr. + void *allocData(int64_t Size, void *HstPtr = nullptr); + /// Deallocates memory which \p TgtPtrBegin points at and returns + /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. + int32_t deleteData(void *TgtPtrBegin); + + // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be + // synchronous. + // Copy data from host to device + int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // Copy data from device back to host + int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + // Copy data from current device to destination device directly + int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + + int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, + int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr); + int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr); + + /// Synchronize device/queue/event based on \p AsyncInfoPtr and return + /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. + int32_t synchronize(__tgt_async_info *AsyncInfoPtr); private: // Call to RTL @@ -162,8 +230,31 @@ struct DeviceTy { /// Map between Device ID (i.e. openmp device id) and its DeviceTy. 
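// A sketch of the lookup pattern the ordered HostDataToTargetListTy enables
// (it mirrors getMapEntryRefCnt/lookupMapping in device.cpp): because
// operator< is also defined against a bare HstPtrBeginTy and the set uses the
// transparent std::less<> comparator, upper_bound() can be called with a raw
// host address and the candidate entry is the one just before it. The helper
// name findContainingEntry is illustrative, not part of this interface.
inline const HostDataToTargetTy *
findContainingEntry(const HostDataToTargetListTy &Map, uintptr_t HstAddr) {
  if (Map.empty())
    return nullptr;
  // First entry whose HstPtrBegin is strictly greater than HstAddr.
  auto Upper = Map.upper_bound(HstAddr);
  if (Upper == Map.begin())
    return nullptr;
  // Candidate: the entry with the largest HstPtrBegin <= HstAddr.
  --Upper;
  if (HstAddr >= Upper->HstPtrBegin && HstAddr < Upper->HstPtrEnd)
    return &*Upper;
  return nullptr;
}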
typedef std::vector DevicesTy; -extern DevicesTy Devices; extern bool device_is_ready(int device_num); +/// Struct for the data required to handle plugins +struct PluginManager { + /// RTLs identified on the host + RTLsTy RTLs; + + /// Devices associated with RTLs + DevicesTy Devices; + std::mutex RTLsMtx; ///< For RTLs and Devices + + /// Translation table retreived from the binary + HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; + std::mutex TrlTblMtx; ///< For Translation Table + + /// Map from ptrs on the host to an entry in the Translation Table + HostPtrToTableMapTy HostPtrToTableMap; + std::mutex TblMapMtx; ///< For HostPtrToTableMap + + // Store target policy (disabled, mandatory, default) + kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; + std::mutex TargetOffloadMtx; ///< For TargetOffloadPolicy +}; + +extern PluginManager *PM; + #endif diff --git a/libomptarget/src/exports b/libomptarget/src/exports index e1fee4bbe..5e09a0885 100644 --- a/libomptarget/src/exports +++ b/libomptarget/src/exports @@ -13,6 +13,16 @@ VERS1.0 { __tgt_target_data_update_nowait; __tgt_target_nowait; __tgt_target_teams_nowait; + __tgt_target_data_begin_mapper; + __tgt_target_data_end_mapper; + __tgt_target_data_update_mapper; + __tgt_target_mapper; + __tgt_target_teams_mapper; + __tgt_target_data_begin_nowait_mapper; + __tgt_target_data_end_nowait_mapper; + __tgt_target_data_update_nowait_mapper; + __tgt_target_nowait_mapper; + __tgt_target_teams_nowait_mapper; __tgt_mapper_num_components; __tgt_push_mapper_component; omp_get_num_devices; diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp index 59cf454e8..c773e1fda 100644 --- a/libomptarget/src/interface.cpp +++ b/libomptarget/src/interface.cpp @@ -11,79 +11,97 @@ // //===----------------------------------------------------------------------===// -#include - #include "device.h" #include "private.h" #include "rtl.h" #include +#include #include #include -// Store target policy (disabled, mandatory, default) -kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; -std::mutex TargetOffloadMtx; - //////////////////////////////////////////////////////////////////////////////// -/// manage the success or failure of a target constuct - +/// manage the success or failure of a target construct static void HandleDefaultTargetOffload() { - TargetOffloadMtx.lock(); - if (TargetOffloadPolicy == tgt_default) { + PM->TargetOffloadMtx.lock(); + if (PM->TargetOffloadPolicy == tgt_default) { if (omp_get_num_devices() > 0) { DP("Default TARGET OFFLOAD policy is now mandatory " "(devices were found)\n"); - TargetOffloadPolicy = tgt_mandatory; + PM->TargetOffloadPolicy = tgt_mandatory; } else { DP("Default TARGET OFFLOAD policy is now disabled " "(no devices were found)\n"); - TargetOffloadPolicy = tgt_disabled; + PM->TargetOffloadPolicy = tgt_disabled; } } - TargetOffloadMtx.unlock(); + PM->TargetOffloadMtx.unlock(); } static int IsOffloadDisabled() { - if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); - return TargetOffloadPolicy == tgt_disabled; + if (PM->TargetOffloadPolicy == tgt_default) + HandleDefaultTargetOffload(); + return PM->TargetOffloadPolicy == tgt_disabled; } -static void HandleTargetOutcome(bool success) { - switch (TargetOffloadPolicy) { - case tgt_disabled: - if (success) { - FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); - } - break; - case tgt_default: - FATAL_MESSAGE0(1, "default offloading policy must be switched to " - "mandatory or disabled"); - 
break; - case tgt_mandatory: - if (!success) { - FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); - } - break; +static void HandleTargetOutcome(bool success, ident_t *loc = nullptr) { + switch (PM->TargetOffloadPolicy) { + case tgt_disabled: + if (success) { + FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); + } + break; + case tgt_default: + FATAL_MESSAGE0(1, "default offloading policy must be switched to " + "mandatory or disabled"); + break; + case tgt_mandatory: + if (!success) { + if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) + for (const auto &Device : PM->Devices) + dumpTargetPointerMappings(loc, Device); + else + FAILURE_MESSAGE("Run with LIBOMPTARGET_DEBUG=%d to dump host-target " + "pointer mappings.\n", + OMP_INFOTYPE_DUMP_TABLE); + + SourceInfo info(loc); + if (info.isAvailible()) + fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(), + info.getColumn()); + else + FAILURE_MESSAGE("Source location information not present. Compile with " + "-g or -gline-tables-only.\n"); + FATAL_MESSAGE0( + 1, "failure of target construct while offloading is mandatory"); + } else { + if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) + for (const auto &Device : PM->Devices) + dumpTargetPointerMappings(loc, Device); + } + break; } } //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t flags) { - RTLs.RegisterRequires(flags); + TIMESCOPE(); + PM->RTLs.RegisterRequires(flags); } //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { - RTLs.RegisterLib(desc); + TIMESCOPE(); + PM->RTLs.RegisterLib(desc); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { - RTLs.UnregisterLib(desc); + TIMESCOPE(); + PM->RTLs.UnregisterLib(desc); } /// creates host-to-target data mapping, stores it in the @@ -91,6 +109,30 @@ EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { /// and passes the data to the device. 
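// For orientation, a sketch of the user code whose data-region entry and exit
// reach the __tgt_target_data_begin*/__tgt_target_data_end* entry points
// below. The lowering shown in the comments is conceptual; the exact generated
// argument arrays (one base/begin/size/type tuple per map clause) are
// compiler-specific, and saxpy is a hypothetical example.
//
//   void saxpy(float *x, float *y, float a, int n) {
//   #pragma omp target data map(to : x[0:n]) map(tofrom : y[0:n])
//     { // region entry -> __tgt_target_data_begin_mapper(...)
//   #pragma omp target teams distribute parallel for
//       for (int i = 0; i < n; ++i)
//         y[i] = a * x[i] + y[i];
//     } // region exit -> __tgt_target_data_end_mapper(...)
//   }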
EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); + __tgt_target_data_begin_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + TIMESCOPE(); + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_begin_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data begin region for device %" PRId64 " with %d mappings\n", @@ -104,34 +146,40 @@ EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); + HandleTargetOutcome(false, loc); return; } - DeviceTy& Device = Devices[device_id]; + DeviceTy &Device = PM->Devices[device_id]; + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) + printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, + arg_names, "Entering OpenMP data region"); #ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); - __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, - arg_types); + __tgt_target_data_begin_mapper(loc, device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_names, arg_mappers); } /// passes data from the target, releases target memory and destroys @@ -139,6 +187,30 @@ EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, /// created by the last __tgt_target_data_begin. 
EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); + __tgt_target_data_end_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + TIMESCOPE(); + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_end_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data end region with %d mappings\n", arg_num); @@ -147,48 +219,78 @@ EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, device_id = omp_get_default_device(); } - RTLsMtx.lock(); - size_t Devices_size = Devices.size(); - RTLsMtx.unlock(); - if (Devices_size <= (size_t)device_id) { + PM->RTLsMtx.lock(); + size_t DevicesSize = PM->Devices.size(); + PM->RTLsMtx.unlock(); + if (DevicesSize <= (size_t)device_id) { DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); - HandleTargetOutcome(false); + HandleTargetOutcome(false, loc); return; } - DeviceTy &Device = Devices[device_id]; + DeviceTy &Device = PM->Devices[device_id]; if (!Device.IsInit) { DP("Uninit device: ignore"); - HandleTargetOutcome(false); + HandleTargetOutcome(false, loc); return; } + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) + printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, + arg_names, "Exiting OpenMP data region"); #ifdef OMPTARGET_DEBUG for (int i=0; i 0) + __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); + + __tgt_target_data_end_mapper(loc, device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_names, arg_mappers); +} + +EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); + __tgt_target_data_update_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, - arg_types); + __tgt_target_data_update_mapper(nullptr, device_id, arg_num, args_base, args, + arg_sizes, arg_types, nullptr, nullptr); } -EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +EXTERN void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id, + int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, + int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data update with %d mappings\n", arg_num); @@ -199,29 +301,57 
@@ EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); + HandleTargetOutcome(false, loc); return; } - DeviceTy& Device = Devices[device_id]; - int rc = target_data_update(Device, arg_num, args_base, - args, arg_sizes, arg_types); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) + printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, + arg_names, "Updating OpenMP data"); + + DeviceTy &Device = PM->Devices[device_id]; + int rc = targetDataUpdate(Device, arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } -EXTERN void __tgt_target_data_update_nowait( - int64_t device_id, int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { +EXTERN void __tgt_target_data_update_nowait_mapper( + ident_t *loc, int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); - __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, - arg_types); + __tgt_target_data_update_mapper(loc, device_id, arg_num, args_base, args, + arg_sizes, arg_types, arg_names, arg_mappers); } EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); + return __tgt_target_mapper(nullptr, device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + TIMESCOPE(); + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_mapper(nullptr, device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, nullptr, nullptr); +} + +EXTERN int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -231,39 +361,72 @@ EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, } if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); + REPORT("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false, loc); return OFFLOAD_FAIL; } + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) + printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, + arg_names, "Entering OpenMP kernel"); #ifdef OMPTARGET_DEBUG for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + 
__kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); - return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types); + return __tgt_target_mapper(loc, device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, arg_names, arg_mappers); } EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t team_num, int32_t thread_limit) { + TIMESCOPE(); + return __tgt_target_teams_mapper(nullptr, device_id, host_ptr, arg_num, + args_base, args, arg_sizes, arg_types, + nullptr, nullptr, team_num, thread_limit); +} + +EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_teams_mapper(nullptr, device_id, host_ptr, arg_num, + args_base, args, arg_sizes, arg_types, + nullptr, nullptr, team_num, thread_limit); +} + +EXTERN int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id, + void *host_ptr, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, + void **arg_mappers, int32_t team_num, + int32_t thread_limit) { + TIMESCOPE(); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -273,39 +436,48 @@ EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, } if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); + REPORT("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false, loc); return OFFLOAD_FAIL; } + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) + printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, + arg_names, "Entering OpenMP kernel"); #ifdef OMPTARGET_DEBUG for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); - return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, - arg_sizes, arg_types, team_num, thread_limit); + return __tgt_target_teams_mapper(loc, device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, arg_names, + arg_mappers, team_num, thread_limit); } // Get the current number of components for a user-defined mapper. EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { + TIMESCOPE(); auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; int64_t size = MapperComponentsPtr->Components.size(); DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", @@ -315,19 +487,22 @@ EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { // Push back one component for a user-defined mapper. 
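// A conceptual sketch of how a user-defined mapper routine would use this
// entry point: one component is pushed per piece of the mapped aggregate.
// The Vec struct, the routine name, and reusing the incoming Type bits for
// both components are assumptions for illustration; real mapper functions are
// generated by the compiler from 'declare mapper' directives.
//
//   struct Vec { int len; double *data; };
//
//   void omp_mapper_Vec(void *Handle, void *Base, void *Begin, int64_t Size,
//                       int64_t Type, void *Name) {
//     Vec *V = static_cast<Vec *>(Begin);
//     // the struct itself, then the dynamically sized payload it points to
//     __tgt_push_mapper_component(Handle, Base, Begin, sizeof(Vec), Type,
//                                 Name);
//     __tgt_push_mapper_component(Handle, &V->data, V->data,
//                                 V->len * sizeof(double), Type, Name);
//   }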
EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, - int64_t type) { + void *begin, int64_t size, int64_t type, + void *name) { + TIMESCOPE(); DP("__tgt_push_mapper_component(Handle=" DPxMOD ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ").\n", - DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); + ", Type=0x%" PRIx64 ", Name=%s).\n", + DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type, + (name) ? getNameFromMapping(name).c_str() : "unknown"); auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; MapperComponentsPtr->Components.push_back( - MapComponentInfoTy(base, begin, size, type)); + MapComponentInfoTy(base, begin, size, type, name)); } -EXTERN void __kmpc_push_target_tripcount(int64_t device_id, - uint64_t loop_tripcount) { +EXTERN void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id, + uint64_t loop_tripcount) { + TIMESCOPE(); if (IsOffloadDisabled()) return; @@ -337,14 +512,14 @@ EXTERN void __kmpc_push_target_tripcount(int64_t device_id, if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); + HandleTargetOutcome(false, loc); return; } DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, loop_tripcount); - TblMapMtx.lock(); - Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), - loop_tripcount); - TblMapMtx.unlock(); + PM->TblMapMtx.lock(); + PM->Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), + loop_tripcount); + PM->TblMapMtx.unlock(); } diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp index 2feb7c89f..8cb16a489 100644 --- a/libomptarget/src/omptarget.cpp +++ b/libomptarget/src/omptarget.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#include - #include "device.h" #include "private.h" #include "rtl.h" @@ -20,12 +18,6 @@ #include #include -#ifdef OMPTARGET_DEBUG -int DebugLevel = 0; -#endif // OMPTARGET_DEBUG - - - /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -56,7 +48,7 @@ int DebugLevel = 0; * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and * &s1.p=0x208, as they should be to satisfy the alignment requirements. 
*/ -static const int64_t alignment = 8; +static const int64_t Alignment = 8; /// Map global data and execute pending ctors static int InitLibrary(DeviceTy& Device) { @@ -67,11 +59,16 @@ static int InitLibrary(DeviceTy& Device) { int rc = OFFLOAD_SUCCESS; Device.PendingGlobalsMtx.lock(); - TrlTblMtx.lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable.begin(); - ii != HostEntriesBeginToTransTable.end(); ++ii) { - TranslationTable *TransTable = &ii->second; + PM->TrlTblMtx.lock(); + for (HostEntriesBeginToTransTableTy::iterator entry_it = + PM->HostEntriesBeginToTransTable.begin(); + entry_it != PM->HostEntriesBeginToTransTable.end(); ++entry_it) { + TranslationTable *TransTable = &entry_it->second; + if (TransTable->HostTable.EntriesBegin == + TransTable->HostTable.EntriesEnd) { + // No host entry so no need to proceed + continue; + } if (TransTable->TargetsTable[device_id] != 0) { // Library entries have already been processed continue; @@ -82,7 +79,7 @@ static int InitLibrary(DeviceTy& Device) { "Not expecting a device ID outside the table's bounds!"); __tgt_device_image *img = TransTable->TargetsImages[device_id]; if (!img) { - DP("No image loaded for device id %d.\n", device_id); + REPORT("No image loaded for device id %d.\n", device_id); rc = OFFLOAD_FAIL; break; } @@ -91,7 +88,7 @@ static int InitLibrary(DeviceTy& Device) { TransTable->TargetsTable[device_id] = Device.load_binary(img); // Unable to get table for this image: invalidate image and fail. if (!TargetTable) { - DP("Unable to generate entries table for device id %d.\n", device_id); + REPORT("Unable to generate entries table for device id %d.\n", device_id); TransTable->TargetsImages[device_id] = 0; rc = OFFLOAD_FAIL; break; @@ -104,8 +101,8 @@ static int InitLibrary(DeviceTy& Device) { // Invalid image for these host entries! if (hsize != tsize) { - DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", - device_id, hsize, tsize); + REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); TransTable->TargetsImages[device_id] = 0; TransTable->TargetsTable[device_id] = 0; rc = OFFLOAD_FAIL; @@ -134,17 +131,17 @@ static int InitLibrary(DeviceTy& Device) { DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), CurrDeviceEntry->size); - Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + Device.HostDataToTargetMap.emplace( (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, - INF_REF_CNT /*RefCount*/)); + (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr, + true /*IsRefCountINF*/); } } Device.DataMapMtx.unlock(); } - TrlTblMtx.unlock(); + PM->TrlTblMtx.unlock(); if (rc != OFFLOAD_SUCCESS) { Device.PendingGlobalsMtx.unlock(); @@ -161,10 +158,10 @@ static int InitLibrary(DeviceTy& Device) { DP("Has pending ctors... 
call now\n"); for (auto &entry : lib.second.PendingCtors) { void *ctor = entry; - int rc = target(device_id, ctor, 0, NULL, NULL, NULL, - NULL, 1, 1, true /*team*/); + int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/); if (rc != OFFLOAD_SUCCESS) { - DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); return OFFLOAD_FAIL; } @@ -186,32 +183,77 @@ static int InitLibrary(DeviceTy& Device) { int CheckDeviceAndCtors(int64_t device_id) { // Is device ready? if (!device_is_ready(device_id)) { - DP("Device %" PRId64 " is not ready.\n", device_id); + REPORT("Device %" PRId64 " is not ready.\n", device_id); return OFFLOAD_FAIL; } // Get device info. - DeviceTy &Device = Devices[device_id]; + DeviceTy &Device = PM->Devices[device_id]; // Check whether global data has been mapped for this device Device.PendingGlobalsMtx.lock(); bool hasPendingGlobals = Device.HasPendingGlobals; Device.PendingGlobalsMtx.unlock(); if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { - DP("Failed to init globals on device %" PRId64 "\n", device_id); + REPORT("Failed to init globals on device %" PRId64 "\n", device_id); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } -static int32_t member_of(int64_t type) { +static int32_t getParentIndex(int64_t type) { return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } +/// Call the user-defined mapper function followed by the appropriate +// target_data_* function (target_data_{begin,end,update}). +int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg, + int64_t arg_size, int64_t arg_type, + map_var_info_t arg_names, void *arg_mapper, + TargetDataFuncPtrTy target_data_function) { + DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); + + // The mapper function fills up Components. + MapperComponentsTy MapperComponents; + MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper); + (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type, + arg_names); + + // Construct new arrays for args_base, args, arg_sizes and arg_types + // using the information in MapperComponents and call the corresponding + // target_data_* function using these new arrays. + std::vector MapperArgsBase(MapperComponents.Components.size()); + std::vector MapperArgs(MapperComponents.Components.size()); + std::vector MapperArgSizes(MapperComponents.Components.size()); + std::vector MapperArgTypes(MapperComponents.Components.size()); + std::vector MapperArgNames(MapperComponents.Components.size()); + + for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { + auto &C = + MapperComponents + .Components[target_data_function == targetDataEnd ? 
I : E - I - 1]; + MapperArgsBase[I] = C.Base; + MapperArgs[I] = C.Begin; + MapperArgSizes[I] = C.Size; + MapperArgTypes[I] = C.Type; + MapperArgNames[I] = C.Name; + } + + int rc = target_data_function(Device, MapperComponents.Components.size(), + MapperArgsBase.data(), MapperArgs.data(), + MapperArgSizes.data(), MapperArgTypes.data(), + MapperArgNames.data(), /*arg_mappers*/ nullptr, + /*__tgt_async_info*/ nullptr); + + return rc; +} + /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { +int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -219,18 +261,40 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) continue; + if (arg_mappers && arg_mappers[i]) { + // Instead of executing the regular path of targetDataBegin, call the + // targetDataMapper variant which will call targetDataBegin again + // with new arguments. + DP("Calling targetDataMapper for the %dth argument\n", i); + + map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i]; + int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], + arg_types[i], arg_name, arg_mappers[i], + targetDataBegin); + + if (rc != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" + " failed.\n"); + return OFFLOAD_FAIL; + } + + // Skip the rest of this function, continue to the next argument. + continue; + } + void *HstPtrBegin = args[i]; void *HstPtrBase = args_base[i]; int64_t data_size = arg_sizes[i]; + map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i]; // Adjust for proper alignment if this is a combined entry (for structs). // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. int64_t padding = 0; const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; + if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && + getParentIndex(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % Alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD "\n", padding, DPxPTR(HstPtrBegin)); @@ -240,13 +304,14 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, } // Address of pointer on the host and device, respectively. - void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; + void *Pointer_HstPtrBegin, *PointerTgtPtrBegin; bool IsNew, Pointer_IsNew; bool IsHostPtr = false; bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; // Force the creation of a device side copy of the data when: // a close map modifier was associated with a map that contained a to. 
bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT; // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we // have reached this point via __tgt_target_data_begin and not __tgt_target // then no argument is marked as TARGET_PARAM ("omp target data map" is not @@ -255,17 +320,31 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); - // base is address of pointer. - Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, - HasCloseModifier); - if (!Pointer_TgtPtrBegin) { - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); + // Base is address of pointer. + // + // Usually, the pointer is already allocated by this time. For example: + // + // #pragma omp target map(s.p[0:N]) + // + // The map entry for s comes first, and the PTR_AND_OBJ entry comes + // afterward, so the pointer is already allocated by the time the + // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus + // non-null. However, "declare target link" can produce a PTR_AND_OBJ + // entry for a global that might not already be allocated by the time the + // PTR_AND_OBJ entry is handled below, and so the allocation might fail + // when HasPresentModifier. + PointerTgtPtrBegin = Device.getOrAllocTgtPtr( + HstPtrBase, HstPtrBase, sizeof(void *), nullptr, Pointer_IsNew, + IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier, + HasPresentModifier); + if (!PointerTgtPtrBegin) { + REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n", + HasPresentModifier ? "'present' map type modifier" + : "device failure or illegal mapping"); return OFFLOAD_FAIL; } DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), (Pointer_IsNew ? "" : " not")); Pointer_HstPtrBegin = HstPtrBase; // modify current entry. @@ -273,13 +352,16 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, UpdateRef = true; // subsequently update ref count of pointee } - void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); - if (!TgtPtrBegin && data_size) { - // If data_size==0, then the argument could be a zero-length pointer to - // NULL, so getOrAlloc() returning NULL is not an error. - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); + void *TgtPtrBegin = Device.getOrAllocTgtPtr( + HstPtrBegin, HstPtrBase, data_size, HstPtrName, IsNew, IsHostPtr, + IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier); + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. + if (!TgtPtrBegin && (data_size || HasPresentModifier)) { + REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n", + HasPresentModifier ? 
"'present' map type modifier" + : "device failure or illegal mapping"); + return OFFLOAD_FAIL; } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), @@ -294,14 +376,17 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { bool copy = false; - if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || HasCloseModifier) { if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { copy = true; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + } else if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { // Copy data only if the "parent" struct has RefCount==1. - int32_t parent_idx = member_of(arg_types[i]); - long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + // If this is a PTR_AND_OBJ entry, the OBJ is not part of the struct, + // so exclude it from this check. + int32_t parent_idx = getParentIndex(arg_types[i]); + uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); assert(parent_rc > 0 && "parent struct not found"); if (parent_rc == 1) { copy = true; @@ -311,10 +396,11 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); + REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } } @@ -322,19 +408,19 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *)); + int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, + sizeof(void *), async_info_ptr); if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); + REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } // create shadow pointers for this entry Device.ShadowMtx.lock(); - Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, - Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowPtrMap[Pointer_HstPtrBegin] = { + HstPtrBase, PointerTgtPtrBegin, TgtPtrBase}; Device.ShadowMtx.unlock(); } } @@ -342,82 +428,148 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num, return OFFLOAD_SUCCESS; } +namespace { +/// This structure contains information to deallocate a target pointer, aka. +/// used to call the function \p DeviceTy::deallocTgtPtr. 
+struct DeallocTgtPtrInfo { + /// Host pointer used to look up into the map table + void *HstPtrBegin; + /// Size of the data + int64_t DataSize; + /// Whether it is forced to be removed from the map table + bool ForceDelete; + /// Whether it has \p close modifier + bool HasCloseModifier; + + DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool ForceDelete, + bool HasCloseModifier) + : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete), + HasCloseModifier(HasCloseModifier) {} +}; +} // namespace + /// Internal function to undo the mapping and retrieve the data from the device. -int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types) { +int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, + __tgt_async_info *AsyncInfo) { + int Ret; + std::vector DeallocTgtPtrs; // process each input. - for (int32_t i = arg_num - 1; i >= 0; --i) { + for (int32_t I = ArgNum - 1; I >= 0; --I) { // Ignore private variables and arrays - there is no mapping for them. // Also, ignore the use_device_ptr directive, it has no effect here. - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; - void *HstPtrBegin = args[i]; - int64_t data_size = arg_sizes[i]; + if (ArgMappers && ArgMappers[I]) { + // Instead of executing the regular path of targetDataEnd, call the + // targetDataMapper variant which will call targetDataEnd again + // with new arguments. + DP("Calling targetDataMapper for the %dth argument\n", I); + + map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; + Ret = + targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd); + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" + " failed.\n"); + return OFFLOAD_FAIL; + } + + // Skip the rest of this function, continue to the next argument. + continue; + } + + void *HstPtrBegin = Args[I]; + int64_t DataSize = ArgSizes[I]; // Adjust for proper alignment if this is a combined entry (for structs). // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. 
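The adjustment the comment above describes mirrors the one targetDataBegin applies on the way in. As a compact restatement of what the code below computes (using the 8-byte Alignment constant defined earlier in this file; a sketch, not the literal code):

// Padding     = (int64_t)HstPtrBegin % Alignment;  // distance past an 8-byte boundary
// HstPtrBegin = (char *)HstPtrBegin - Padding;     // back the begin address up
// DataSize   += Padding;                           // widen the range by the same amount
// so the first mapped member of a partially mapped struct stays properly
// aligned on the device, as motivated by the layout example in the block
// comment near the top of this file.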
- int64_t padding = 0; - const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; - data_size += padding; + const int NextI = I + 1; + if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && + getParentIndex(ArgTypes[NextI]) == I) { + int64_t Padding = (int64_t)HstPtrBegin % Alignment; + if (Padding) { + DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", + Padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - Padding; + DataSize += Padding; } } bool IsLast, IsHostPtr; - bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || - (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); - bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; - bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; + bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || + (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; + bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE; + bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, - UpdateRef, IsHostPtr); - DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), - (IsLast ? "" : " not")); + void *TgtPtrBegin = Device.getTgtPtrBegin( + HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit); + if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { + DP("Mapping does not exist (%s)\n", + (HasPresentModifier ? "'present' map type modifier" : "ignored")); + if (HasPresentModifier) { + // This should be an error upon entering an "omp target exit data". It + // should not be an error upon exiting an "omp target data" or "omp + // target". For "omp target data", Clang thus doesn't include present + // modifiers for end calls. For "omp target", we have not found a valid + // OpenMP program for which the error matters: it appears that, if a + // program can guarantee that data is present at the beginning of an + // "omp target" region so that there's no error there, that data is also + // guaranteed to be present at the end. + MESSAGE("device mapping required by 'present' map type modifier does " + "not exist for host address " DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), DataSize); + return OFFLOAD_FAIL; + } + } else { + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s last\n", + DataSize, DPxPTR(TgtPtrBegin), (IsLast ? 
"" : " not")); + } bool DelEntry = IsLast || ForceDelete; - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { DelEntry = false; // protect parent struct from being deallocated } - if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { // Move data back to the host - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; + if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { + bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; bool CopyMember = false; - if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || HasCloseModifier) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { // Copy data only if the "parent" struct has RefCount==1. - int32_t parent_idx = member_of(arg_types[i]); - long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { + int32_t ParentIdx = getParentIndex(ArgTypes[I]); + uint64_t ParentRC = Device.getMapEntryRefCnt(Args[ParentIdx]); + assert(ParentRC > 0 && "parent struct not found"); + if (ParentRC == 1) CopyMember = true; - } } } if ((DelEntry || Always || CopyMember) && - !(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin)) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); + DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, + AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; } } @@ -427,139 +579,251 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, // need to restore the original host pointer values from their shadow // copies. If the struct is going to be deallocated, remove any remaining // shadow pointer entries for this struct. - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; + uintptr_t LB = (uintptr_t)HstPtrBegin; + uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize; Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end();) { - void **ShadowHstPtrAddr = (void**) it->first; + for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin(); + Itr != Device.ShadowPtrMap.end();) { + void **ShadowHstPtrAddr = (void **)Itr->first; // An STL map is sorted on its keys; use this property // to quickly determine when to break out of the loop. - if ((uintptr_t) ShadowHstPtrAddr < lb) { - ++it; + if ((uintptr_t)ShadowHstPtrAddr < LB) { + ++Itr; continue; } - if ((uintptr_t) ShadowHstPtrAddr >= ub) + if ((uintptr_t)ShadowHstPtrAddr >= UB) break; // If we copied the struct to the host, we need to restore the pointer. 
- if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; + "pointer " DPxMOD "\n", + DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = Itr->second.HstPtrVal; } // If the struct is to be deallocated, remove the shadow entry. if (DelEntry) { DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); - it = Device.ShadowPtrMap.erase(it); + Itr = Device.ShadowPtrMap.erase(Itr); } else { - ++it; + ++Itr; } } Device.ShadowMtx.unlock(); - // Deallocate map - if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, - HasCloseModifier); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocating data from device failed.\n"); - return OFFLOAD_FAIL; - } - } + // Add pointer to the buffer for later deallocation + if (DelEntry) + DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, ForceDelete, + HasCloseModifier); + } + } + + // We need to synchronize before deallocating data. + // If AsyncInfo is nullptr, the previous data transfer (if has) will be + // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is + // nullptr, there is no data transfer happened because once there is, + // AsyncInfo->Queue will not be nullptr, so again, we don't need to + // synchronize. + if (AsyncInfo && AsyncInfo->Queue) { + Ret = Device.synchronize(AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to synchronize device.\n"); + return OFFLOAD_FAIL; + } + } + + // Deallocate target pointer + for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { + Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, + Info.ForceDelete, Info.HasCloseModifier); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Deallocating data from device failed.\n"); + return OFFLOAD_FAIL; } } return OFFLOAD_SUCCESS; } -/// Internal function to pass data to/from the target. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - // process each input. 
- for (int32_t i = 0; i < arg_num; ++i) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; +static int targetDataContiguous(DeviceTy &Device, void *ArgsBase, + void *HstPtrBegin, int64_t ArgSize, + int64_t ArgType) { + bool IsLast, IsHostPtr; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, + IsHostPtr, /*MustContain=*/true); + if (!TgtPtrBegin) { + DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + if (ArgType & OMP_TGT_MAPTYPE_PRESENT) { + MESSAGE("device mapping required by 'present' motion modifier does not " + "exist for host address " DPxMOD " (%" PRId64 " bytes)", + DPxPTR(HstPtrBegin), ArgSize); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; + } - void *HstPtrBegin = args[i]; - int64_t MapSize = arg_sizes[i]; - bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, - false, IsHostPtr); - if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); - continue; + if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", + DPxPTR(HstPtrBegin)); + return OFFLOAD_SUCCESS; + } + + if (ArgType & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data from device failed.\n"); + return OFFLOAD_FAIL; } - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", - DPxPTR(HstPtrBegin)); - continue; + uintptr_t LB = (uintptr_t)HstPtrBegin; + uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); + IT != Device.ShadowPtrMap.end(); ++IT) { + void **ShadowHstPtrAddr = (void **)IT->first; + if ((uintptr_t)ShadowHstPtrAddr < LB) + continue; + if ((uintptr_t)ShadowHstPtrAddr >= UB) + break; + DP("Restoring original host pointer value " DPxMOD + " for host pointer " DPxMOD "\n", + DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = IT->second.HstPtrVal; } + Device.ShadowMtx.unlock(); + } - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); + if (ArgType & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t LB = (uintptr_t)HstPtrBegin; + uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); + IT != Device.ShadowPtrMap.end(); ++IT) { + void **ShadowHstPtrAddr = (void **)IT->first; + if ((uintptr_t)ShadowHstPtrAddr < LB) + continue; + if ((uintptr_t)ShadowHstPtrAddr >= UB) + break; + DP("Restoring original target pointer value " DPxMOD " for 
target " + "pointer " DPxMOD "\n", + DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr)); + Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal, + sizeof(void *), nullptr); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); + Device.ShadowMtx.unlock(); return OFFLOAD_FAIL; } + } + Device.ShadowMtx.unlock(); + } + return OFFLOAD_SUCCESS; +} - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; +static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase, + __tgt_target_non_contig *NonContig, + uint64_t Size, int64_t ArgType, + int CurrentDim, int DimSize, + uint64_t Offset) { + int Ret = OFFLOAD_SUCCESS; + if (CurrentDim < DimSize) { + for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { + uint64_t CurOffset = + (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; + // we only need to transfer the first element for the last dimension + // since we've already got a contiguous piece. + if (CurrentDim != DimSize - 1 || I == 0) { + Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size, + ArgType, CurrentDim + 1, DimSize, + Offset + CurOffset); + // Stop the whole process if any contiguous piece returns anything + // other than OFFLOAD_SUCCESS. + if (Ret != OFFLOAD_SUCCESS) + return Ret; } - Device.ShadowMtx.unlock(); } + } else { + char *Ptr = (char *)ArgsBase + Offset; + DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n", + (uint64_t)Ptr, Offset, Size); + Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType); + } + return Ret; +} - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); +static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, + int32_t DimSize) { + int RemovedDim = 0; + for (int I = DimSize - 1; I > 0; --I) { + if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) + RemovedDim++; + } + return RemovedDim; +} + +/// Internal function to pass data to/from the target. +// async_info_ptr is currently unused, added here so targetDataUpdate has the +// same signature as targetDataBegin and targetDataEnd. +int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, + __tgt_async_info *AsyncInfoPtr) { + // process each input. + for (int32_t I = 0; I < ArgNum; ++I) { + if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + if (ArgMappers && ArgMappers[I]) { + // Instead of executing the regular path of targetDataUpdate, call the + // targetDataMapper variant which will call targetDataUpdate again + // with new arguments. 
+ DP("Calling targetDataMapper for the %dth argument\n", I); + + map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; + int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgName, ArgMappers[I], + targetDataUpdate); + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper" + " failed.\n"); return OFFLOAD_FAIL; } - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), - DPxPTR(it->second.TgtPtrAddr)); - rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *)); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - Device.ShadowMtx.unlock(); - return OFFLOAD_FAIL; - } - } - Device.ShadowMtx.unlock(); + // Skip the rest of this function, continue to the next argument. + continue; } + + int Ret = OFFLOAD_SUCCESS; + + if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { + __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; + int32_t DimSize = ArgSizes[I]; + uint64_t Size = + NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; + int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); + Ret = targetDataNonContiguous( + Device, ArgsBase[I], NonContig, Size, ArgTypes[I], + /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0); + } else { + Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I], + ArgTypes[I]); + } + if (Ret == OFFLOAD_FAIL) + return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } @@ -571,235 +835,456 @@ static bool isLambdaMapping(int64_t Mapping) { return (Mapping & LambdaMapping) == LambdaMapping; } -/// performs the same actions as data_begin in case arg_num is -/// non-zero and initiates run of the offloaded region on the target platform; -/// if arg_num is non-zero after the region execution is done it also -/// performs the same action as data_update and data_end above. This function -/// returns 0 if it was able to transfer the execution to a target and an -/// integer different from zero otherwise. -int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { - DeviceTy &Device = Devices[device_id]; - - // Find the table information in the map or look it up in the translation - // tables. - TableMap *TM = 0; - TblMapMtx.lock(); - HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); - if (TableMapIt == HostPtrToTableMap.end()) { - // We don't have a map. So search all the registered libraries. - TrlTblMtx.lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable.begin(), - ie = HostEntriesBeginToTransTable.end(); - !TM && ii != ie; ++ii) { - // get the translation table (which contains all the good info). - TranslationTable *TransTable = &ii->second; - // iterate over all the host table entries to see if we can locate the - // host_ptr. 
- __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; - __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; - __tgt_offload_entry *cur = begin; - for (uint32_t i = 0; cur < end; ++cur, ++i) { - if (cur->addr != host_ptr) - continue; - // we got a match, now fill the HostPtrToTableMap so that we - // may avoid this search next time. - TM = &HostPtrToTableMap[host_ptr]; - TM->Table = TransTable; - TM->Index = i; - break; +namespace { +/// Find the table information in the map or look it up in the translation +/// tables. +TableMap *getTableMap(void *HostPtr) { + std::lock_guard TblMapLock(PM->TblMapMtx); + HostPtrToTableMapTy::iterator TableMapIt = + PM->HostPtrToTableMap.find(HostPtr); + + if (TableMapIt != PM->HostPtrToTableMap.end()) + return &TableMapIt->second; + + // We don't have a map. So search all the registered libraries. + TableMap *TM = nullptr; + std::lock_guard TrlTblLock(PM->TrlTblMtx); + for (HostEntriesBeginToTransTableTy::iterator Itr = + PM->HostEntriesBeginToTransTable.begin(); + Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { + // get the translation table (which contains all the good info). + TranslationTable *TransTable = &Itr->second; + // iterate over all the host table entries to see if we can locate the + // host_ptr. + __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; + for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { + if (Cur->addr != HostPtr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &(PM->HostPtrToTableMap)[HostPtr]; + TM->Table = TransTable; + TM->Index = I; + return TM; + } + } + + return nullptr; +} + +/// Get loop trip count +/// FIXME: This function will not work right if calling +/// __kmpc_push_target_tripcount in one thread but doing offloading in another +/// thread, which might occur when we call task yield. +uint64_t getLoopTripCount(int64_t DeviceId) { + DeviceTy &Device = PM->Devices[DeviceId]; + uint64_t LoopTripCount = 0; + + { + std::lock_guard TblMapLock(PM->TblMapMtx); + auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); + if (I != Device.LoopTripCnt.end()) { + LoopTripCount = I->second; + Device.LoopTripCnt.erase(I); + DP("loop trip count is %lu.\n", LoopTripCount); + } + } + + return LoopTripCount; +} + +/// A class manages private arguments in a target region. +class PrivateArgumentManagerTy { + /// A data structure for the information of first-private arguments. We can + /// use this information to optimize data transfer by packing all + /// first-private arguments and transfer them all at once. 
+ struct FirstPrivateArgInfoTy { + /// The index of the element in \p TgtArgs corresponding to the argument + const int Index; + /// Host pointer begin + const char *HstPtrBegin; + /// Host pointer end + const char *HstPtrEnd; + /// Aligned size + const int64_t AlignedSize; + /// Host pointer name + const map_var_info_t HstPtrName = nullptr; + + FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size, + const map_var_info_t HstPtrName = nullptr) + : Index(Index), HstPtrBegin(reinterpret_cast(HstPtr)), + HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment), + HstPtrName(HstPtrName) {} + }; + + /// A vector of target pointers for all private arguments + std::vector TgtPtrs; + + /// A vector of information of all first-private arguments to be packed + std::vector FirstPrivateArgInfo; + /// Host buffer for all arguments to be packed + std::vector FirstPrivateArgBuffer; + /// The total size of all arguments to be packed + int64_t FirstPrivateArgSize = 0; + + /// A reference to the \p DeviceTy object + DeviceTy &Device; + /// A pointer to a \p __tgt_async_info object + __tgt_async_info *AsyncInfo; + + // TODO: What would be the best value here? Should we make it configurable? + // If the size is larger than this threshold, we will allocate and transfer it + // immediately instead of packing it. + static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; + +public: + /// Constructor + PrivateArgumentManagerTy(DeviceTy &Dev, __tgt_async_info *AsyncInfo) + : Device(Dev), AsyncInfo(AsyncInfo) {} + + /// Add a private argument + int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, + bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, + const map_var_info_t HstPtrName = nullptr) { + // If the argument is not first-private, or its size is greater than a + // predefined threshold, we will allocate memory and issue the transfer + // immediately. + if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) { + TgtPtr = Device.allocData(ArgSize, HstPtr); + if (!TgtPtr) { + DP("Data allocation for %sprivate array " DPxMOD " failed.\n", + (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)); + return OFFLOAD_FAIL; } +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD + " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD + "\n", + ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""), + DPxPTR(HstPtr), DPxPTR(TgtPtrBase)); +#endif + // If first-private, copy data from host + if (IsFirstPrivate) { + int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } + } + TgtPtrs.push_back(TgtPtr); + } else { + DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", + DPxPTR(HstPtr), ArgSize); + // When reach this point, the argument must meet all following + // requirements: + // 1. Its size does not exceed the threshold (see the comment for + // FirstPrivateArgSizeThreshold); + // 2. It must be first-private (needs to be mapped to target device). + // We will pack all this kind of arguments to transfer them all at once + // to reduce the number of data transfer. We will not take + // non-first-private arguments, aka. private arguments that doesn't need + // to be mapped to target device, into account because data allocation + // can be very efficient with memory manager. 
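Condensed, the decision addArg makes is the following (a sketch restating the code, not the literal implementation):

// if (ArgSize > FirstPrivateArgSizeThreshold /*1024*/ || !IsFirstPrivate)
//   TgtPtr = Device.allocData(ArgSize, HstPtr);  // own allocation, copied now if first-private
// else
//   record {TgtArgsIndex, HstPtr, ArgSize} in FirstPrivateArgInfo and leave
//   TgtPtr == nullptr as a placeholder in TgtArgs.
// packAndTransfer() later concatenates every recorded argument into one host
// buffer, performs a single allocData + submitData for the whole buffer, and
// patches each nullptr placeholder in TgtArgs with the buffer base plus the
// running AlignedSize offset.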
+ + // Placeholder value + TgtPtr = nullptr; + FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, + HstPtrName); + FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; } - TrlTblMtx.unlock(); - } else { - TM = &TableMapIt->second; + + return OFFLOAD_SUCCESS; } - TblMapMtx.unlock(); - // No map for this host pointer found! - if (!TM) { - DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", - DPxPTR(host_ptr)); - return OFFLOAD_FAIL; + /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, + /// and start the transfer. + int packAndTransfer(std::vector &TgtArgs) { + if (!FirstPrivateArgInfo.empty()) { + assert(FirstPrivateArgSize != 0 && + "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty"); + FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); + auto Itr = FirstPrivateArgBuffer.begin(); + // Copy all host data to this buffer + for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + Itr = std::next(Itr, Info.AlignedSize); + } + // Allocate target memory + void *TgtPtr = + Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); + if (TgtPtr == nullptr) { + DP("Failed to allocate target memory for private arguments.\n"); + return OFFLOAD_FAIL; + } + TgtPtrs.push_back(TgtPtr); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", + FirstPrivateArgSize, DPxPTR(TgtPtr)); + // Transfer data to target device + int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), + FirstPrivateArgSize, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to submit data of private arguments.\n"); + return OFFLOAD_FAIL; + } + // Fill in all placeholder pointers + auto TP = reinterpret_cast(TgtPtr); + for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { + void *&Ptr = TgtArgs[Info.Index]; + assert(Ptr == nullptr && "Target pointer is already set by mistaken"); + Ptr = reinterpret_cast(TP); + TP += Info.AlignedSize; + DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD + "\n", + DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, + DPxPTR(Ptr)); + } + } + + return OFFLOAD_SUCCESS; } - // get target table. - TrlTblMtx.lock(); - assert(TM->Table->TargetsTable.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; - TrlTblMtx.unlock(); - assert(TargetTable && "Global data has not been mapped\n"); + /// Free all target memory allocated for private arguments + int free() { + for (void *P : TgtPtrs) { + int Ret = Device.deleteData(P); + if (Ret != OFFLOAD_SUCCESS) { + DP("Deallocation of (first-)private arrays failed.\n"); + return OFFLOAD_FAIL; + } + } - // Move data to device. - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types); - if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin failed, abort target.\n"); + TgtPtrs.clear(); + + return OFFLOAD_SUCCESS; + } +}; + +/// Process data before launching the kernel, including calling targetDataBegin +/// to map and transfer data to target device, transferring (first-)private +/// variables. 
+int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *ArgNames, + void **ArgMappers, std::vector &TgtArgs, + std::vector &TgtOffsets, + PrivateArgumentManagerTy &PrivateArgumentManager, + __tgt_async_info *AsyncInfo) { + DeviceTy &Device = PM->Devices[DeviceId]; + int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, + ArgNames, ArgMappers, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } - std::vector tgt_args; - std::vector tgt_offsets; - // List of (first-)private arrays allocated for this target region - std::vector fpArrays; - std::vector tgtArgsPositions(arg_num, -1); + std::vector TgtArgsPositions(ArgNum, -1); - for (int32_t i = 0; i < arg_num; ++i) { - if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { - // This is not a target parameter, do not push it into tgt_args. + for (int32_t I = 0; I < ArgNum; ++I) { + if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { + // This is not a target parameter, do not push it into TgtArgs. // Check for lambda mapping. - if (isLambdaMapping(arg_types[i])) { - assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + if (isLambdaMapping(ArgTypes[I])) { + assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && "PTR_AND_OBJ must be also MEMBER_OF."); - unsigned idx = member_of(arg_types[i]); - int tgtIdx = tgtArgsPositions[idx]; - assert(tgtIdx != -1 && "Base address must be translated already."); + unsigned Idx = getParentIndex(ArgTypes[I]); + int TgtIdx = TgtArgsPositions[Idx]; + assert(TgtIdx != -1 && "Base address must be translated already."); // The parent lambda must be processed already and it must be the last - // in tgt_args and tgt_offsets arrays. - void *HstPtrVal = args[i]; - void *HstPtrBegin = args_base[i]; - void *HstPtrBase = args[idx]; + // in TgtArgs and TgtOffsets arrays. + void *HstPtrVal = Args[I]; + void *HstPtrBegin = ArgBases[I]; + void *HstPtrBase = Args[Idx]; bool IsLast, IsHostPtr; // unused. 
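The pointer arithmetic that follows patches a lambda's captured pointer on the device; in outline (a restatement of the code below, not new logic):

// TgtPtrBase  = TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]  // device copy of the lambda object
// Delta       = HstPtrBegin - HstPtrBase              // offset of the captured field inside it
// TgtPtrBegin = TgtPtrBase + Delta                    // that field's device address
// The device address of the captured variable's own mapping
// (PointerTgtPtrBegin) is then written into that field with submitData(), so
// the device-side lambda dereferences device memory instead of a stale host
// pointer.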
void *TgtPtrBase = - (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); + (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *Pointer_TgtPtrBegin = - Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, - IsHostPtr); - if (!Pointer_TgtPtrBegin) { + void *PointerTgtPtrBegin = Device.getTgtPtrBegin( + HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr); + if (!PointerTgtPtrBegin) { DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", DPxPTR(HstPtrVal)); continue; } - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin) { DP("Unified memory is active, no need to map lambda captured" - "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); + "variable (" DPxMOD ")\n", + DPxPTR(HstPtrVal)); continue; } DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *)); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); + DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); + Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, + sizeof(void *), AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; } } continue; } - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; + void *HstPtrBegin = Args[I]; + void *HstPtrBase = ArgBases[I]; void *TgtPtrBegin; + map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; ptrdiff_t TgtBaseOffset; bool IsLast, IsHostPtr; // unused. - if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { + if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { DP("Forwarding first-private value " DPxMOD " to the target construct\n", - DPxPTR(HstPtrBase)); + DPxPTR(HstPtrBase)); TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { - // Allocate memory for (first-)private array - TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); - if (!TgtPtrBegin) { - DP ("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); - return OFFLOAD_FAIL; - } - fpArrays.push_back(TgtPtrBegin); + } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; -#ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); -#endif - // If first-private, copy data from host - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); - if (rt != OFFLOAD_SUCCESS) { - DP ("Copying data to device failed, failed.\n"); - return OFFLOAD_FAIL; - } + // Can be marked for optimization if the next argument(s) do(es) not + // depend on this one. 
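That is, the entry is flagged IsFirstPrivate, and so becomes a candidate for packing, only when nothing that follows still refers into it (a restatement of the condition below, not new logic):

// IsFirstPrivate = (I >= ArgNum - 1) ||                         // last argument, or
//                  !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF);  // next entry is not a member entry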
+ const bool IsFirstPrivate = + (I >= ArgNum - 1 || !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); + Ret = PrivateArgumentManager.addArg( + HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, + TgtArgs.size(), HstPtrName); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process %sprivate argument " DPxMOD "\n", + (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; } - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, - false, IsHostPtr); - TgtBaseOffset = 0; // no offset for ptrs. - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " - "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), - DPxPTR(HstPtrBase)); } else { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, - false, IsHostPtr); + if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) + HstPtrBase = *reinterpret_cast(HstPtrBase); + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, + false, IsHostPtr); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); #endif } - tgtArgsPositions[i] = tgt_args.size(); - tgt_args.push_back(TgtPtrBegin); - tgt_offsets.push_back(TgtBaseOffset); + TgtArgsPositions[I] = TgtArgs.size(); + TgtArgs.push_back(TgtPtrBegin); + TgtOffsets.push_back(TgtBaseOffset); } - assert(tgt_args.size() == tgt_offsets.size() && - "Size mismatch in arguments and offsets"); + assert(TgtArgs.size() == TgtOffsets.size() && + "Size mismatch in arguments and offsets"); - // Pop loop trip count - uint64_t ltc = 0; - TblMapMtx.lock(); - auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); - if (I != Device.LoopTripCnt.end()) { - ltc = I->second; - Device.LoopTripCnt.erase(I); - DP("loop trip count is %lu.\n", ltc); + // Pack and transfer first-private arguments + Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to pack and transfer first private arguments\n"); + return OFFLOAD_FAIL; } - TblMapMtx.unlock(); - // Launch device execution. - DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", - TargetTable->EntriesBegin[TM->Index].name, - DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); - if (IsTeamConstruct) { - rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, - thread_limit, ltc); - } else { - rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size()); + return OFFLOAD_SUCCESS; +} + +/// Process data after launching the kernel, including transferring data back to +/// host if needed and deallocating target memory of (first-)private variables. +int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *ArgNames, + void **ArgMappers, + PrivateArgumentManagerTy &PrivateArgumentManager, + __tgt_async_info *AsyncInfo) { + DeviceTy &Device = PM->Devices[DeviceId]; + + // Move data from device. 
+ int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, + ArgNames, ArgMappers, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Call to targetDataEnd failed, abort target.\n"); + return OFFLOAD_FAIL; } - if (rc != OFFLOAD_SUCCESS) { - DP ("Executing target region abort target.\n"); + + // Free target memory for private arguments + Ret = PrivateArgumentManager.free(); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to deallocate target memory for private args\n"); return OFFLOAD_FAIL; } - // Deallocate (first-)private arrays - for (auto it : fpArrays) { - int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocation of (first-)private arrays failed.\n"); - return OFFLOAD_FAIL; - } + return OFFLOAD_SUCCESS; +} +} // namespace + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. +int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, + int32_t ThreadLimit, int IsTeamConstruct) { + DeviceTy &Device = PM->Devices[DeviceId]; + + TableMap *TM = getTableMap(HostPtr); + // No map for this host pointer found! + if (!TM) { + REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(HostPtr)); + return OFFLOAD_FAIL; } - // Move data from device. - int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types); - if (rt != OFFLOAD_SUCCESS) { - DP("Call to target_data_end failed, abort targe.\n"); + // get target table. + __tgt_target_table *TargetTable = nullptr; + { + std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); + assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && + "Not expecting a device ID outside the table's bounds!"); + TargetTable = TM->Table->TargetsTable[DeviceId]; + } + assert(TargetTable && "Global data has not been mapped\n"); + + __tgt_async_info AsyncInfo; + + std::vector<void *> TgtArgs; + std::vector<ptrdiff_t> TgtOffsets; + + PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo); + + // Process data, such as data mapping, before launching the kernel + int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, + TgtOffsets, PrivateArgumentManager, &AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process data before launching the kernel.\n"); + return OFFLOAD_FAIL; + } + + // Get loop trip count + uint64_t LoopTripCount = getLoopTripCount(DeviceId); + + // Launch device execution.
+ void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); + + if (IsTeamConstruct) + Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), TeamNum, ThreadLimit, + LoopTripCount, &AsyncInfo); + else + Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), &AsyncInfo); + + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + // Transfer data back and deallocate target memory for (first-)private + // variables + Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, + ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager, + &AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; } diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h index 20470a69b..6a44dd9e4 100644 --- a/libomptarget/src/private.h +++ b/libomptarget/src/private.h @@ -13,43 +13,46 @@ #ifndef _OMPTARGET_PRIVATE_H #define _OMPTARGET_PRIVATE_H +#include +#include #include #include -extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); +extern int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + __tgt_async_info *async_info_ptr); -extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); +extern int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *arg_names, void **ArgMappers, + __tgt_async_info *AsyncInfo); -extern int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); +extern int targetDataUpdate(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, + __tgt_async_info *async_info_ptr = nullptr); -extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct); +extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *arg_names, + void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, + int IsTeamConstruct); extern int CheckDeviceAndCtors(int64_t device_id); -// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition -enum kmp_target_offload_kind { - tgt_disabled = 0, - tgt_default = 1, - tgt_mandatory = 2 -}; -typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; -extern kmp_target_offload_kind_t TargetOffloadPolicy; - // This structure stores information of a mapped memory region. 
struct MapComponentInfoTy { void *Base; void *Begin; int64_t Size; int64_t Type; + void *Name; MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type, + void *Name) + : Base(Base), Begin(Begin), Size(Size), Type(Type), Name(Name) {} }; // This structure stores all components of a user-defined mapper. The number of @@ -57,24 +60,22 @@ struct MapComponentInfoTy { // implementation here. struct MapperComponentsTy { std::vector Components; + int32_t size() { return Components.size(); } }; -//////////////////////////////////////////////////////////////////////////////// -// implemtation for fatal messages -//////////////////////////////////////////////////////////////////////////////// +// The mapper function pointer type. It follows the signature below: +// void .omp_mapper...(void *rt_mapper_handle, +// void *base, void *begin, +// size_t size, int64_t type, +// void * name); +typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t, + void *); -#define FATAL_MESSAGE0(_num, _str) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ - exit(1); \ - } while (0) - -#define FATAL_MESSAGE(_num, _str, ...) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ - __VA_ARGS__); \ - exit(1); \ - } while (0) +// Function pointer type for target_data_* functions (targetDataBegin, +// targetDataEnd and targetDataUpdate). +typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, + int64_t *, int64_t *, map_var_info_t *, + void **, __tgt_async_info *); // Implemented in libomp, they are called from within __tgt_* functions. #ifdef __cplusplus @@ -89,17 +90,75 @@ int __kmpc_get_target_offload(void) __attribute__((weak)); } #endif -#ifdef OMPTARGET_DEBUG -extern int DebugLevel; - -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Libomptarget", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) 
{} -#endif // OMPTARGET_DEBUG +#define TARGET_NAME Libomptarget +#define DEBUG_PREFIX GETNAME(TARGET_NAME) + +//////////////////////////////////////////////////////////////////////////////// +/// dump a table of all the host-target pointer pairs on failure +static inline void dumpTargetPointerMappings(const ident_t *Loc, + const DeviceTy &Device) { + if (Device.HostDataToTargetMap.empty()) + return; + + SourceInfo Kernel(Loc); + INFO(OMP_INFOTYPE_ALL, Device.DeviceID, + "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n", + Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn()); + INFO(OMP_INFOTYPE_ALL, Device.DeviceID, "%-18s %-18s %s %s %s\n", "Host Ptr", + "Target Ptr", "Size (B)", "RefCount", "Declaration"); + for (const auto &HostTargetMap : Device.HostDataToTargetMap) { + SourceInfo Info(HostTargetMap.HstPtrName); + INFO(OMP_INFOTYPE_ALL, Device.DeviceID, + DPxMOD " " DPxMOD " %-8lu %-8ld %s at %s:%d:%d\n", + DPxPTR(HostTargetMap.HstPtrBegin), DPxPTR(HostTargetMap.TgtPtrBegin), + (long unsigned)(HostTargetMap.HstPtrEnd - HostTargetMap.HstPtrBegin), + HostTargetMap.getRefCount(), Info.getName(), Info.getFilename(), + Info.getLine(), Info.getColumn()); + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Print out the names and properties of the arguments to each kernel +static inline void +printKernelArguments(const ident_t *Loc, const int64_t DeviceId, + const int32_t ArgNum, const int64_t *ArgSizes, + const int64_t *ArgTypes, const map_var_info_t *ArgNames, + const char *RegionType) { + SourceInfo info(Loc); + INFO(OMP_INFOTYPE_ALL, DeviceId, "%s at %s:%d:%d with %d arguments:\n", + RegionType, info.getFilename(), info.getLine(), info.getColumn(), + ArgNum); + + for (int32_t i = 0; i < ArgNum; ++i) { + const map_var_info_t varName = (ArgNames) ? ArgNames[i] : nullptr; + const char *type = nullptr; + const char *implicit = + (ArgTypes[i] & OMP_TGT_MAPTYPE_IMPLICIT) ? "(implicit)" : ""; + if (ArgTypes[i] & OMP_TGT_MAPTYPE_TO && ArgTypes[i] & OMP_TGT_MAPTYPE_FROM) + type = "tofrom"; + else if (ArgTypes[i] & OMP_TGT_MAPTYPE_TO) + type = "to"; + else if (ArgTypes[i] & OMP_TGT_MAPTYPE_FROM) + type = "from"; + else if (ArgTypes[i] & OMP_TGT_MAPTYPE_PRIVATE) + type = "private"; + else if (ArgTypes[i] & OMP_TGT_MAPTYPE_LITERAL) + type = "firstprivate"; + else if (ArgSizes[i] != 0) + type = "alloc"; + else + type = "use_address"; + + INFO(OMP_INFOTYPE_ALL, DeviceId, "%s(%s)[%ld] %s\n", type, + getNameFromMapping(varName).c_str(), ArgSizes[i], implicit); + } +} + +#ifdef OMPTARGET_PROFILE_ENABLED +#include "llvm/Support/TimeProfiler.h" +#define TIMESCOPE() llvm::TimeTraceScope TimeScope(__FUNCTION__) +#else +#define TIMESCOPE() +#endif #endif diff --git a/libomptarget/src/rtl.cpp b/libomptarget/src/rtl.cpp index 35470f587..443359a62 100644 --- a/libomptarget/src/rtl.cpp +++ b/libomptarget/src/rtl.cpp @@ -23,30 +23,52 @@ // List of all plugins that can support offloading. 
static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so", + /* AArch64 target */ "libomptarget.rtl.aarch64.so", + /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", + /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", +}; + +PluginManager *PM; + +#if OMPTARGET_PROFILE_ENABLED +static char *ProfileTraceFile = nullptr; +#endif -RTLsTy RTLs; -std::mutex RTLsMtx; +__attribute__((constructor(101))) void init() { + DP("Init target library!\n"); + PM = new PluginManager(); -HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; -std::mutex TrlTblMtx; +#ifdef OMPTARGET_PROFILE_ENABLED + ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); + // TODO: add a configuration option for time granularity + if (ProfileTraceFile) + llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); +#endif +} -HostPtrToTableMapTy HostPtrToTableMap; -std::mutex TblMapMtx; +__attribute__((destructor(101))) void deinit() { + DP("Deinit target library!\n"); + delete PM; -void RTLsTy::LoadRTLs() { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); +#ifdef OMPTARGET_PROFILE_ENABLED + if (ProfileTraceFile) { + // TODO: add env var for file output + if (auto E = llvm::timeTraceProfilerWrite(ProfileTraceFile, "-")) + fprintf(stderr, "Error writing out the time trace\n"); + + llvm::timeTraceProfilerCleanup(); } -#endif // OMPTARGET_DEBUG +#endif +} +void RTLsTy::LoadRTLs() { // Parse environment variable OMP_TARGET_OFFLOAD (if set) - TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); - if (TargetOffloadPolicy == tgt_disabled) { + PM->TargetOffloadPolicy = + (kmp_target_offload_kind_t)__kmpc_get_target_offload(); + if (PM->TargetOffloadPolicy == tgt_disabled) { return; } @@ -66,62 +88,86 @@ void RTLsTy::LoadRTLs() { DP("Successfully loaded library '%s'!\n", Name); - // Retrieve the RTL information from the runtime library. - RTLInfoTy R; + AllRTLs.emplace_back(); - R.LibraryHandler = dynlib_handle; - R.isUsed = false; - -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif - - if (!(*((void**) &R.is_valid_binary) = dlsym( - dynlib_handle, "__tgt_rtl_is_valid_binary"))) - continue; - if (!(*((void**) &R.number_of_devices) = dlsym( - dynlib_handle, "__tgt_rtl_number_of_devices"))) - continue; - if (!(*((void**) &R.init_device) = dlsym( - dynlib_handle, "__tgt_rtl_init_device"))) - continue; - if (!(*((void**) &R.load_binary) = dlsym( - dynlib_handle, "__tgt_rtl_load_binary"))) - continue; - if (!(*((void**) &R.data_alloc) = dlsym( - dynlib_handle, "__tgt_rtl_data_alloc"))) - continue; - if (!(*((void**) &R.data_submit) = dlsym( - dynlib_handle, "__tgt_rtl_data_submit"))) - continue; - if (!(*((void**) &R.data_retrieve) = dlsym( - dynlib_handle, "__tgt_rtl_data_retrieve"))) - continue; - if (!(*((void**) &R.data_delete) = dlsym( - dynlib_handle, "__tgt_rtl_data_delete"))) - continue; - if (!(*((void**) &R.run_region) = dlsym( - dynlib_handle, "__tgt_rtl_run_target_region"))) - continue; - if (!(*((void**) &R.run_team_region) = dlsym( - dynlib_handle, "__tgt_rtl_run_target_team_region"))) + // Retrieve the RTL information from the runtime library. 
+ RTLInfoTy &R = AllRTLs.back(); + + bool ValidPlugin = true; + + if (!(*((void **)&R.is_valid_binary) = + dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) + ValidPlugin = false; + if (!(*((void **)&R.number_of_devices) = + dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) + ValidPlugin = false; + if (!(*((void **)&R.init_device) = + dlsym(dynlib_handle, "__tgt_rtl_init_device"))) + ValidPlugin = false; + if (!(*((void **)&R.load_binary) = + dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) + ValidPlugin = false; + if (!(*((void **)&R.data_alloc) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) + ValidPlugin = false; + if (!(*((void **)&R.data_submit) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) + ValidPlugin = false; + if (!(*((void **)&R.data_retrieve) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) + ValidPlugin = false; + if (!(*((void **)&R.data_delete) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) + ValidPlugin = false; + if (!(*((void **)&R.run_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) + ValidPlugin = false; + if (!(*((void **)&R.run_team_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) + ValidPlugin = false; + + // Invalid plugin + if (!ValidPlugin) { + DP("Invalid plugin as necessary interface is not found.\n"); + AllRTLs.pop_back(); continue; - - // Optional functions - *((void**) &R.init_requires) = dlsym( - dynlib_handle, "__tgt_rtl_init_requires"); + } // No devices are supported by this RTL? if (!(R.NumberOfDevices = R.number_of_devices())) { + // The RTL is invalid! Will pop the object from the RTLs list. DP("No devices supported in this RTL\n"); + AllRTLs.pop_back(); continue; } - DP("Registering RTL %s supporting %d devices!\n", - R.RTLName.c_str(), R.NumberOfDevices); + R.LibraryHandler = dynlib_handle; - // The RTL is valid! Will save the information in the RTLs list. 
- AllRTLs.push_back(R); +#ifdef OMPTARGET_DEBUG + R.RTLName = Name; +#endif + + DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), + R.NumberOfDevices); + + // Optional functions + *((void **)&R.init_requires) = + dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.run_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); + *((void **)&R.run_team_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + *((void **)&R.data_exchange) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange"); + *((void **)&R.data_exchange_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async"); + *((void **)&R.is_data_exchangable) = + dlsym(dynlib_handle, "__tgt_rtl_is_data_exchangable"); } DP("RTLs loaded!\n"); @@ -165,7 +211,7 @@ static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, __tgt_device_image *img, RTLInfoTy *RTL) { for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[RTL->Idx + i]; + DeviceTy &Device = PM->Devices[RTL->Idx + i]; Device.PendingGlobalsMtx.lock(); Device.HasPendingGlobals = true; for (__tgt_offload_entry *entry = img->EntriesBegin; @@ -226,7 +272,7 @@ void RTLsTy::RegisterRequires(int64_t flags) { // TODO: insert any other missing checks - DP("New requires flags %ld compatible with existing %ld!\n", + DP("New requires flags %" PRId64 " compatible with existing %" PRId64 "!\n", flags, RequiresFlags); } @@ -234,7 +280,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { // Attempt to load all plugins available in the system. std::call_once(initFlag, &RTLsTy::LoadRTLs, this); - RTLsMtx.lock(); + PM->RTLsMtx.lock(); // Register the images with the RTLs that understand them, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. @@ -244,7 +290,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { // Scan the RTLs that have associated images until we find one that supports // the current image. - for (auto &R : RTLs.AllRTLs) { + for (auto &R : AllRTLs) { if (!R.is_valid_binary(img)) { DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); @@ -258,46 +304,45 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { if (!R.isUsed) { // Initialize the device information for the RTL we are about to use. DeviceTy device(&R); - size_t start = Devices.size(); - Devices.resize(start + R.NumberOfDevices, device); + size_t Start = PM->Devices.size(); + PM->Devices.resize(Start + R.NumberOfDevices, device); for (int32_t device_id = 0; device_id < R.NumberOfDevices; device_id++) { // global device ID - Devices[start + device_id].DeviceID = start + device_id; + PM->Devices[Start + device_id].DeviceID = Start + device_id; // RTL local device ID - Devices[start + device_id].RTLDeviceID = device_id; + PM->Devices[Start + device_id].RTLDeviceID = device_id; } // Initialize the index of this RTL and save it in the used RTLs. - R.Idx = (RTLs.UsedRTLs.empty()) + R.Idx = (UsedRTLs.empty()) ? 
0 - : RTLs.UsedRTLs.back()->Idx + - RTLs.UsedRTLs.back()->NumberOfDevices; - assert((size_t) R.Idx == start && + : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; + assert((size_t) R.Idx == Start && "RTL index should equal the number of devices used so far."); R.isUsed = true; - RTLs.UsedRTLs.push_back(&R); + UsedRTLs.push_back(&R); DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); } // Initialize (if necessary) translation table for this library. - TrlTblMtx.lock(); - if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){ - TranslationTable &tt = - HostEntriesBeginToTransTable[desc->HostEntriesBegin]; - tt.HostTable.EntriesBegin = desc->HostEntriesBegin; - tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + PM->TrlTblMtx.lock(); + if (!PM->HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)) { + TranslationTable &TransTable = + (PM->HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + TransTable.HostTable.EntriesBegin = desc->HostEntriesBegin; + TransTable.HostTable.EntriesEnd = desc->HostEntriesEnd; } // Retrieve translation table for this library. TranslationTable &TransTable = - HostEntriesBeginToTransTable[desc->HostEntriesBegin]; + (PM->HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; DP("Registering image " DPxMOD " with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); RegisterImageIntoTranslationTable(TransTable, R, img); - TrlTblMtx.unlock(); + PM->TrlTblMtx.unlock(); FoundRTL = &R; // Load ctors/dtors for static objects @@ -311,8 +356,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); } } - RTLsMtx.unlock(); - + PM->RTLsMtx.unlock(); DP("Done registering entries!\n"); } @@ -320,7 +364,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { DP("Unloading target library!\n"); - RTLsMtx.lock(); + PM->RTLsMtx.lock(); // Find which RTL understands each image, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. @@ -330,7 +374,7 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { // Scan the RTLs that have associated images until we find one that supports // the current image. We only need to scan RTLs that are already being used. - for (auto *R : RTLs.UsedRTLs) { + for (auto *R : UsedRTLs) { assert(R->isUsed && "Expecting used RTLs."); @@ -348,12 +392,12 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { // Execute dtors for static objects if the device has been used, i.e. // if its PendingCtors list has been emptied. 
for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[FoundRTL->Idx + i]; + DeviceTy &Device = PM->Devices[FoundRTL->Idx + i]; Device.PendingGlobalsMtx.lock(); if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, - 1, true /*team*/); + int rc = target(Device.DeviceID, dtor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/); if (rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); } @@ -376,28 +420,28 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { DPxPTR(img->ImageStart)); } } - RTLsMtx.unlock(); + PM->RTLsMtx.unlock(); DP("Done unregistering images!\n"); - // Remove entries from HostPtrToTableMap - TblMapMtx.lock(); + // Remove entries from PM->HostPtrToTableMap + PM->TblMapMtx.lock(); for (__tgt_offload_entry *cur = desc->HostEntriesBegin; cur < desc->HostEntriesEnd; ++cur) { - HostPtrToTableMap.erase(cur->addr); + PM->HostPtrToTableMap.erase(cur->addr); } // Remove translation table for this descriptor. - auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin); - if (tt != HostEntriesBeginToTransTable.end()) { + auto TransTable = PM->HostEntriesBeginToTransTable.find(desc->HostEntriesBegin); + if (TransTable != PM->HostEntriesBeginToTransTable.end()) { DP("Removing translation table for descriptor " DPxMOD "\n", DPxPTR(desc->HostEntriesBegin)); - HostEntriesBeginToTransTable.erase(tt); + PM->HostEntriesBeginToTransTable.erase(TransTable); } else { DP("Translation table for descriptor " DPxMOD " cannot be found, probably " "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); } - TblMapMtx.unlock(); + PM->TblMapMtx.unlock(); // TODO: Remove RTL and the devices it manages if it's not used anymore? // TODO: Write some RTL->unload_image(...) function? 
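The reworked loader in rtl.cpp resolves every mandatory __tgt_rtl_* entry point with dlsym and pops the plugin from AllRTLs if any of them is missing, while the new asynchronous and data-exchange entry points are looked up afterwards and may legitimately stay null. A minimal sketch of that mandatory-versus-optional probing pattern, outside the actual libomptarget sources (the PluginHandle struct, the loadPlugin helper, and the main driver are illustrative assumptions; only the __tgt_rtl_* symbol names and the cast idiom come from this patch):

#include <cstdint>
#include <cstdio>
#include <dlfcn.h>

struct PluginHandle {
  void *Lib = nullptr;
  // Mandatory interface: the plugin is rejected when this is missing.
  int32_t (*number_of_devices)() = nullptr;
  // Optional interface: may stay null for plugins without async support.
  // (The last parameter stands in for __tgt_async_info *.)
  int32_t (*data_submit_async)(int32_t, void *, void *, int64_t, void *) = nullptr;
};

static bool loadPlugin(const char *Name, PluginHandle &P) {
  P.Lib = dlopen(Name, RTLD_NOW);
  if (!P.Lib)
    return false;
  // Same cast idiom as rtl.cpp: store the dlsym result through a void **.
  *((void **)&P.number_of_devices) =
      dlsym(P.Lib, "__tgt_rtl_number_of_devices");
  if (!P.number_of_devices) { // mandatory symbol missing -> invalid plugin
    dlclose(P.Lib);
    return false;
  }
  // Optional symbol: keep the plugin either way; callers must null-check.
  *((void **)&P.data_submit_async) =
      dlsym(P.Lib, "__tgt_rtl_data_submit_async");
  return true;
}

int main() {
  PluginHandle P;
  if (loadPlugin("libomptarget.rtl.x86_64.so", P))
    std::printf("devices=%d async-submit=%s\n", (int)P.number_of_devices(),
                P.data_submit_async ? "yes" : "no");
  return 0;
}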
diff --git a/libomptarget/src/rtl.h b/libomptarget/src/rtl.h index 8148e81e7..b9ead48cd 100644 --- a/libomptarget/src/rtl.h +++ b/libomptarget/src/rtl.h @@ -13,6 +13,7 @@ #ifndef _OMPTARGET_RTL_H #define _OMPTARGET_RTL_H +#include "omptarget.h" #include #include #include @@ -25,84 +26,74 @@ struct __tgt_bin_desc; struct RTLInfoTy { typedef int32_t(is_valid_binary_ty)(void *); + typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t); typedef int32_t(number_of_devices_ty)(); typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t); + typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *, + int64_t, __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t); + typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t, __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, int32_t, int32_t, uint64_t); + typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, + ptrdiff_t *, int32_t, int32_t, + int32_t, uint64_t, + __tgt_async_info *); typedef int64_t(init_requires_ty)(int64_t); + typedef int64_t(synchronize_ty)(int32_t, __tgt_async_info *); - int32_t Idx; // RTL index, index is the number of devices - // of other RTLs that were registered before, - // i.e. the OpenMP index of the first device - // to be registered with this RTL. - int32_t NumberOfDevices; // Number of devices this RTL deals with. + int32_t Idx = -1; // RTL index, index is the number of devices + // of other RTLs that were registered before, + // i.e. the OpenMP index of the first device + // to be registered with this RTL. + int32_t NumberOfDevices = -1; // Number of devices this RTL deals with. - void *LibraryHandler; + void *LibraryHandler = nullptr; #ifdef OMPTARGET_DEBUG std::string RTLName; #endif // Functions implemented in the RTL. 
- is_valid_binary_ty *is_valid_binary; - number_of_devices_ty *number_of_devices; - init_device_ty *init_device; - load_binary_ty *load_binary; - data_alloc_ty *data_alloc; - data_submit_ty *data_submit; - data_retrieve_ty *data_retrieve; - data_delete_ty *data_delete; - run_region_ty *run_region; - run_team_region_ty *run_team_region; - init_requires_ty *init_requires; + is_valid_binary_ty *is_valid_binary = nullptr; + is_data_exchangable_ty *is_data_exchangable = nullptr; + number_of_devices_ty *number_of_devices = nullptr; + init_device_ty *init_device = nullptr; + load_binary_ty *load_binary = nullptr; + data_alloc_ty *data_alloc = nullptr; + data_submit_ty *data_submit = nullptr; + data_submit_async_ty *data_submit_async = nullptr; + data_retrieve_ty *data_retrieve = nullptr; + data_retrieve_async_ty *data_retrieve_async = nullptr; + data_exchange_ty *data_exchange = nullptr; + data_exchange_async_ty *data_exchange_async = nullptr; + data_delete_ty *data_delete = nullptr; + run_region_ty *run_region = nullptr; + run_region_async_ty *run_region_async = nullptr; + run_team_region_ty *run_team_region = nullptr; + run_team_region_async_ty *run_team_region_async = nullptr; + init_requires_ty *init_requires = nullptr; + synchronize_ty *synchronize = nullptr; // Are there images associated with this RTL. - bool isUsed; + bool isUsed = false; // Mutex for thread-safety when calling RTL interface functions. // It is easier to enforce thread-safety at the libomptarget level, // so that developers of new RTLs do not have to worry about it. std::mutex Mtx; - - // The existence of the mutex above makes RTLInfoTy non-copyable. - // We need to provide a copy constructor explicitly. - RTLInfoTy() - : Idx(-1), NumberOfDevices(-1), LibraryHandler(0), -#ifdef OMPTARGET_DEBUG - RTLName(), -#endif - is_valid_binary(0), number_of_devices(0), init_device(0), - load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0), - data_delete(0), run_region(0), run_team_region(0), - init_requires(0), isUsed(false), Mtx() {} - - RTLInfoTy(const RTLInfoTy &r) : Mtx() { - Idx = r.Idx; - NumberOfDevices = r.NumberOfDevices; - LibraryHandler = r.LibraryHandler; -#ifdef OMPTARGET_DEBUG - RTLName = r.RTLName; -#endif - is_valid_binary = r.is_valid_binary; - number_of_devices = r.number_of_devices; - init_device = r.init_device; - load_binary = r.load_binary; - data_alloc = r.data_alloc; - data_submit = r.data_submit; - data_retrieve = r.data_retrieve; - data_delete = r.data_delete; - run_region = r.run_region; - run_team_region = r.run_team_region; - init_requires = r.init_requires; - isUsed = r.isUsed; - } }; /// RTLs identified in the system. @@ -121,9 +112,9 @@ class RTLsTy { // binaries. std::vector UsedRTLs; - int64_t RequiresFlags; + int64_t RequiresFlags = OMP_REQ_UNDEFINED; - explicit RTLsTy() {} + explicit RTLsTy() = default; // Register the clauses of the requires directive. void RegisterRequires(int64_t flags); @@ -134,8 +125,6 @@ class RTLsTy { // Unregister a shared library from all RTLs. void UnregisterLib(__tgt_bin_desc *desc); }; -extern RTLsTy RTLs; -extern std::mutex RTLsMtx; /// Map between the host entry begin and the translation table. 
Each @@ -153,19 +142,15 @@ struct TranslationTable { }; typedef std::map<__tgt_offload_entry *, TranslationTable> HostEntriesBeginToTransTableTy; -extern HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; -extern std::mutex TrlTblMtx; /// Map between the host ptr and a table index struct TableMap { - TranslationTable *Table; // table associated with the host ptr. - uint32_t Index; // index in which the host ptr translated entry is found. - TableMap() : Table(0), Index(0) {} + TranslationTable *Table = nullptr; // table associated with the host ptr. + uint32_t Index = 0; // index in which the host ptr translated entry is found. + TableMap() = default; TableMap(TranslationTable *table, uint32_t index) : Table(table), Index(index) {} }; typedef std::map HostPtrToTableMapTy; -extern HostPtrToTableMapTy HostPtrToTableMap; -extern std::mutex TblMapMtx; #endif diff --git a/libomptarget/test/CMakeLists.txt b/libomptarget/test/CMakeLists.txt index 607801e50..bc720aee6 100644 --- a/libomptarget/test/CMakeLists.txt +++ b/libomptarget/test/CMakeLists.txt @@ -12,7 +12,7 @@ else() set(LIBOMPTARGET_DEBUG False) endif() -add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp) +add_openmp_testsuite(check-bolt-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS bolt-omptarget bolt-omp ${LIBBOLTTARGET_TESTED_PLUGINS}) # Configure the lit.site.cfg.in file set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") diff --git a/libomptarget/test/api/omp_get_num_devices_with_empty_target.c b/libomptarget/test/api/omp_get_num_devices_with_empty_target.c new file mode 100644 index 000000000..85dcb73f1 --- /dev/null +++ b/libomptarget/test/api/omp_get_num_devices_with_empty_target.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +static int test_omp_get_num_devices_with_empty_target() { + /* checks that omp_get_num_devices() > 0 */ + return omp_get_num_devices() > 0; +} + +int main() { + int failed = 0; + + if (!test_omp_get_num_devices_with_empty_target()) { + ++failed; + } + + if (failed) { + printf("FAIL\n"); + } else { + printf("PASS\n"); + } + + return failed; +} + +// CHECK: PASS diff --git a/libomptarget/test/env/base_ptr_ref_count.c b/libomptarget/test/env/base_ptr_ref_count.c new file mode 100644 index 000000000..403695418 --- /dev/null +++ b/libomptarget/test/env/base_ptr_ref_count.c @@ -0,0 +1,51 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda +// REQUIRES: 
libomptarget-debug + +#include + +int *allocate(size_t n) { + int *ptr = malloc(sizeof(int) * n); +#pragma omp target enter data map(to : ptr[:n]) + return ptr; +} + +void deallocate(int *ptr, size_t n) { +#pragma omp target exit data map(delete : ptr[:n]) + free(ptr); +} + +#pragma omp declare target +int *cnt; +void foo() { + ++(*cnt); +} +#pragma omp end declare target + +int main(void) { + int *A = allocate(10); + int *V = allocate(10); + deallocate(A, 10); + deallocate(V, 10); +// CHECK-NOT: RefCount=2 + cnt = malloc(sizeof(int)); + *cnt = 0; +#pragma omp target map(cnt[:1]) + foo(); + printf("Cnt = %d.\n", *cnt); +// CHECK: Cnt = 1. + *cnt = 0; +#pragma omp target data map(cnt[:1]) +#pragma omp target + foo(); + printf("Cnt = %d.\n", *cnt); +// CHECK: Cnt = 1. + free(cnt); + + return 0; +} + + diff --git a/libomptarget/test/env/omp_target_debug.c b/libomptarget/test/env/omp_target_debug.c index ce84c9842..34a71793d 100644 --- a/libomptarget/test/env/omp_target_debug.c +++ b/libomptarget/test/env/omp_target_debug.c @@ -6,6 +6,8 @@ // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=NDEBUG // REQUIRES: libomptarget-debug int main(void) { diff --git a/libomptarget/test/lit.cfg b/libomptarget/test/lit.cfg index 43116055c..d3e33e176 100644 --- a/libomptarget/test/lit.cfg +++ b/libomptarget/test/lit.cfg @@ -9,6 +9,14 @@ if 'PYLINT_IMPORT' in os.environ: config = object() lit_config = object() +# Use the CUDA device as suggested by the env +if 'CUDA_VISIBLE_DEVICES' in os.environ: + config.environment['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES'] + +# Allow running the tests with omptarget debug output +if 'LIBOMPTARGET_DEBUG' in os.environ: + config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG'] + def append_dynamic_library_path(name, value, sep): if name in config.environment: config.environment[name] = value + sep + config.environment[name] @@ -16,7 +24,7 @@ def append_dynamic_library_path(name, value, sep): config.environment[name] = value # name: The name of this test suite. -config.name = 'libomptarget' +config.name = 'bolt-libomptarget' # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp', '.cc'] @@ -63,6 +71,15 @@ else: # Unices append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") append_dynamic_library_path('LD_LIBRARY_PATH', \ config.omp_host_rtl_directory, ":") + append_dynamic_library_path('LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('LIBRARY_PATH', \ + config.omp_host_rtl_directory, ":") + +# Setup flags for BOLT. If BOLT is not used, they are ignored. 
+# Some tasking tests require larger stack size. +config.environment['ABT_THREAD_STACKSIZE'] = "262144" +# Sleep alleviates oversubscription overheads when -j is specified. +config.environment['KMP_ABT_SCHED_SLEEP'] = "1" # substitutions # - for targets that exist in the system create the actual command. @@ -101,6 +118,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%libomptarget-run-" + \ libomptarget_target, \ "%t-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-run-fail-" + \ + libomptarget_target, \ + "%not --crash %t-" + libomptarget_target)) config.substitutions.append(("%clangxx-" + libomptarget_target, \ "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) config.substitutions.append(("%clang-" + libomptarget_target, \ @@ -129,6 +149,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%libomptarget-run-" + \ libomptarget_target, \ "echo ignored-command")) + config.substitutions.append(("%libomptarget-run-fail-" + \ + libomptarget_target, \ + "echo ignored-command")) config.substitutions.append(("%clang-" + libomptarget_target, \ "echo ignored-command")) config.substitutions.append(("%clangxx-" + libomptarget_target, \ @@ -140,3 +163,4 @@ config.substitutions.append(("%clangxx", config.test_cxx_compiler)) config.substitutions.append(("%clang", config.test_c_compiler)) config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) config.substitutions.append(("%flags", config.test_flags)) +config.substitutions.append(("%not", config.libomptarget_not)) diff --git a/libomptarget/test/lit.site.cfg.in b/libomptarget/test/lit.site.cfg.in index 26ef4920d..1de4e09b5 100644 --- a/libomptarget/test/lit.site.cfg.in +++ b/libomptarget/test/lit.site.cfg.in @@ -11,8 +11,9 @@ config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" config.operating_system = "@CMAKE_SYSTEM_NAME@" config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() -config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_system_targets = "@LIBBOLTTARGET_SYSTEM_TARGETS@".split() config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@" config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ # Let the main config do the real work. 
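As the lit.cfg hunk above shows, %libomptarget-run-fail-<triple> expands to "%not --crash %t-<triple>", so a test passes only when the produced binary actually dies instead of exiting cleanly. A stripped-down sketch of a test written against that substitution (illustrative only; the out-of-bounds mapping and the CHECK line are borrowed from the alloc_fail.c test added just below, which is the real instance of this pattern):

// RUN: %libomptarget-compile-x86_64-pc-linux-gnu
// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \
// RUN: | %fcheck-x86_64-pc-linux-gnu

int main() {
  int arr[4] = {0, 1, 2, 3};
  // arr[1:2] extends past the already-mapped arr[0:2] region; libomptarget
  // refuses to extend an existing allocation and raises a fatal error, which
  // %not --crash turns into a test pass.
#pragma omp target data map(alloc: arr[0:2])
#pragma omp target data map(alloc: arr[1:2])
  ;
  return 0;
}

// CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory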
diff --git a/libomptarget/test/mapping/alloc_fail.c b/libomptarget/test/mapping/alloc_fail.c new file mode 100644 index 000000000..0ecafeb6a --- /dev/null +++ b/libomptarget/test/mapping/alloc_fail.c @@ -0,0 +1,32 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda +// RUN: %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 \ +// RUN: | %fcheck-nvptx64-nvidia-cuda + +// CHECK: Libomptarget message: explicit extension not allowed: host address specified is 0x{{.*}} (8 bytes), but device allocation maps to host at 0x{{.*}} (8 bytes) +// CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer (device failure or illegal mapping). +// CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +// UNSUPPORTED: clang-11 + +int main() { + int arr[4] = {0, 1, 2, 3}; +#pragma omp target data map(alloc: arr[0:2]) +#pragma omp target data map(alloc: arr[1:2]) + ; + return 0; +} diff --git a/libomptarget/test/mapping/declare_mapper_api.cpp b/libomptarget/test/mapping/declare_mapper_api.cpp index 275b6c3c5..eda0c86ee 100644 --- a/libomptarget/test/mapping/declare_mapper_api.cpp +++ b/libomptarget/test/mapping/declare_mapper_api.cpp @@ -2,10 +2,12 @@ // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda #include #include #include +#include // Data structure definitions copied from OpenMP RTL. struct MapComponentInfoTy { @@ -13,9 +15,10 @@ struct MapComponentInfoTy { void *Begin; int64_t Size; int64_t Type; + void *Name; MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type, void *Name) + : Base(Base), Begin(Begin), Size(Size), Type(Type), Name(Name) {} }; struct MapperComponentsTy { @@ -28,7 +31,8 @@ extern "C" { #endif int64_t __tgt_mapper_num_components(void *rt_mapper_handle); void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, int64_t type); + void *begin, int64_t size, int64_t type, + void *name); #ifdef __cplusplus } #endif @@ -38,10 +42,10 @@ int main(int argc, char *argv[]) { void *base, *begin; int64_t size, type; // Push 2 elements into MC. 
- __tgt_push_mapper_component((void *)&MC, base, begin, size, type); - __tgt_push_mapper_component((void *)&MC, base, begin, size, type); + __tgt_push_mapper_component((void *)&MC, base, begin, size, type, nullptr); + __tgt_push_mapper_component((void *)&MC, base, begin, size, type, nullptr); int64_t num = __tgt_mapper_num_components((void *)&MC); // CHECK: num=2 - printf("num=%lld\n", num); + printf("num=%" PRId64 "\n", num); return 0; } diff --git a/libomptarget/test/mapping/declare_mapper_target.cpp b/libomptarget/test/mapping/declare_mapper_target.cpp new file mode 100644 index 000000000..096431f6e --- /dev/null +++ b/libomptarget/test/mapping/declare_mapper_target.cpp @@ -0,0 +1,37 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +#define NUM 1024 + +class C { +public: + int *a; +}; + +#pragma omp declare mapper(id: C s) map(s.a[0:NUM]) + +int main() { + C c; + c.a = (int*) malloc(sizeof(int)*NUM); + for (int i = 0; i < NUM; i++) { + c.a[i] = 1; + } + #pragma omp target teams distribute parallel for map(mapper(id),tofrom: c) + for (int i = 0; i < NUM; i++) { + ++c.a[i]; + } + int sum = 0; + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum = 2048 + printf("Sum = %d\n", sum); + return 0; +} + diff --git a/libomptarget/test/mapping/declare_mapper_target_data.cpp b/libomptarget/test/mapping/declare_mapper_target_data.cpp new file mode 100644 index 000000000..e2636f157 --- /dev/null +++ b/libomptarget/test/mapping/declare_mapper_target_data.cpp @@ -0,0 +1,40 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +#define NUM 1024 + +class C { +public: + int *a; +}; + +#pragma omp declare mapper(id: C s) map(s.a[0:NUM]) + +int main() { + C c; + c.a = (int*) malloc(sizeof(int)*NUM); + for (int i = 0; i < NUM; i++) { + c.a[i] = 1; + } + #pragma omp target data map(mapper(id),tofrom: c) + { + #pragma omp target teams distribute parallel for + for (int i = 0; i < NUM; i++) { + ++c.a[i]; + } + } + int sum = 0; + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum = 2048 + printf("Sum = %d\n", sum); + return 0; +} + diff --git a/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp b/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp new file mode 100644 index 000000000..44fcc20e8 --- /dev/null +++ b/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp @@ -0,0 +1,39 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +#define NUM 1024 + +class C { +public: + int *a; +}; + +#pragma omp declare mapper(id: C s) map(s.a[0:NUM]) + +int main() { + C c; + c.a = (int*) malloc(sizeof(int)*NUM); 
+ for (int i = 0; i < NUM; i++) { + c.a[i] = 1; + } + #pragma omp target enter data map(mapper(id),to: c) + #pragma omp target teams distribute parallel for + for (int i = 0; i < NUM; i++) { + ++c.a[i]; + } + #pragma omp target exit data map(mapper(id),from: c) + int sum = 0; + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum = 2048 + printf("Sum = %d\n", sum); + return 0; +} + diff --git a/libomptarget/test/mapping/declare_mapper_target_update.cpp b/libomptarget/test/mapping/declare_mapper_target_update.cpp new file mode 100644 index 000000000..c894c92dc --- /dev/null +++ b/libomptarget/test/mapping/declare_mapper_target_update.cpp @@ -0,0 +1,61 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +#define NUM 1024 + +class C { +public: + int *a; +}; + +#pragma omp declare mapper(id: C s) map(s.a[0:NUM]) + +int main() { + C c; + int sum = 0; + c.a = (int*) malloc(sizeof(int)*NUM); + for (int i = 0; i < NUM; i++) { + c.a[i] = 1; + } + #pragma omp target enter data map(mapper(id),alloc: c) + #pragma omp target teams distribute parallel for + for (int i = 0; i < NUM; i++) { + c.a[i] = 0; + } + #pragma omp target update from(mapper(id): c) + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum (after first update from) = 0 + printf("Sum (after first update from) = %d\n", sum); + for (int i = 0; i < NUM; i++) { + c.a[i] = 1; + } + #pragma omp target update to(mapper(id): c) + #pragma omp target teams distribute parallel for + for (int i = 0; i < NUM; i++) { + ++c.a[i]; + } + sum = 0; + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum (after update to) = 1024 + printf("Sum (after update to) = %d\n", sum); + #pragma omp target update from(mapper(id): c) + sum = 0; + for (int i = 0; i < NUM; i++) { + sum += c.a[i]; + } + // CHECK: Sum (after second update from) = 2048 + printf("Sum (after second update from) = %d\n", sum); + #pragma omp target exit data map(mapper(id),delete: c) + return 0; +} + diff --git a/libomptarget/test/mapping/delete_inf_refcount.c b/libomptarget/test/mapping/delete_inf_refcount.c new file mode 100644 index 000000000..e6b6e9432 --- /dev/null +++ b/libomptarget/test/mapping/delete_inf_refcount.c @@ -0,0 +1,33 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +#pragma omp declare target +int isHost; +#pragma omp end declare target + +int main(void) { + isHost = -1; + +#pragma omp target enter data map(to: isHost) + +#pragma omp target + { isHost = omp_is_initial_device(); } +#pragma omp target update from(isHost) + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + +#pragma omp target exit data map(delete: isHost) + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); + + return isHost; +} diff --git a/libomptarget/test/mapping/lambda_mapping.cpp b/libomptarget/test/mapping/lambda_mapping.cpp new file mode 100644 index 000000000..23671ebb7 --- /dev/null +++ b/libomptarget/test/mapping/lambda_mapping.cpp @@ -0,0 +1,53 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include + +template +inline void forall(int Begin, int End, LOOP_BODY LoopBody) { +#pragma omp target parallel for schedule(static) + for (int I = Begin; I < End; ++I) { + LoopBody(I); + } +} + +#define N (1000) + +// +// Demonstration of the RAJA abstraction using lambdas +// Requires data mapping onto the target section +// +int main() { + double A[N], B[N], C[N]; + + for (int I = 0; I < N; I++) { + A[I] = I + 1; + B[I] = -I; + C[I] = -9; + } + +#pragma omp target data map(tofrom : C [0:N]) map(to : A [0:N], B [0:N]) + { + forall(0, N, [&](int I) { C[I] += A[I] + B[I]; }); + } + + int Fail = 0; + for (int I = 0; I < N; I++) { + if (C[I] != -8) { + std::cout << "Failed at " << I << " with val " << C[I] << std::endl; + Fail = 1; + } + } + + // CHECK: Succeeded + if (Fail) { + std::cout << "Failed" << std::endl; + } else { + std::cout << "Succeeded" << std::endl; + } + + return 0; +} diff --git a/libomptarget/test/mapping/pr38704.c b/libomptarget/test/mapping/pr38704.c index 3e7135e28..c4e80ca44 100644 --- a/libomptarget/test/mapping/pr38704.c +++ b/libomptarget/test/mapping/pr38704.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda // Clang 6.0 doesn't use the new map interface, undefined behavior when // the compiler emits "old" interface code for structures. 
diff --git a/libomptarget/test/mapping/present/target.c b/libomptarget/test/mapping/present/target.c new file mode 100644 index 000000000..b37eca2bc --- /dev/null +++ b/libomptarget/test/mapping/present/target.c @@ -0,0 +1,45 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", &i, sizeof i); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: i) +#pragma omp target map(present, alloc: i) + ; + + // CHECK: i is present + fprintf(stderr, "i is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer ('present' map type modifier). + // CHECK: Libomptarget error: Call to targetDataBegin failed, abort target. + // CHECK: Libomptarget error: Failed to process data before launching the kernel. + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target map(present, alloc: i) + ; + + // CHECK-NOT: i is present + fprintf(stderr, "i is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_array_extension.c b/libomptarget/test/mapping/present/target_array_extension.c new file mode 100644 index 000000000..e02319c98 --- /dev/null +++ b/libomptarget/test/mapping/present/target_array_extension.c @@ -0,0 +1,115 @@ +// -------------------------------------------------- +// Check extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | 
%fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// END. + +#include + +#define BEFORE 0 +#define AFTER 1 + +#define SIZE 100 + +#if EXTENDS == BEFORE +# define SMALL_BEG (SIZE-2) +# define SMALL_END SIZE +# define LARGE_BEG 0 +# define LARGE_END SIZE +#elif EXTENDS == AFTER +# define SMALL_BEG 0 +# define SMALL_END 2 +# define LARGE_BEG 0 +# define LARGE_END SIZE +#else +# error EXTENDS undefined +#endif + +#define SMALL_SIZE (SMALL_END-SMALL_BEG) +#define LARGE_SIZE (LARGE_END-LARGE_BEG) + +#define SMALL SMALL_BEG:SMALL_SIZE +#define LARGE LARGE_BEG:LARGE_SIZE + +int main() { + int arr[SIZE]; + + // CHECK: addr=0x[[#%x,SMALL_ADDR:]], size=[[#%u,SMALL_BYTES:]] + fprintf(stderr, "addr=%p, size=%ld\n", &arr[SMALL_BEG], + SMALL_SIZE * sizeof arr[0]); + + // CHECK: addr=0x[[#%x,LARGE_ADDR:]], size=[[#%u,LARGE_BYTES:]] + fprintf(stderr, "addr=%p, size=%ld\n", &arr[LARGE_BEG], + LARGE_SIZE * sizeof arr[0]); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: arr[LARGE]) + { +#pragma omp target map(present, tofrom: arr[SMALL]) + ; + } + + // CHECK: arr is present + fprintf(stderr, "arr is present\n"); + + // CHECK: Libomptarget message: explicit extension not allowed: host address specified is 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes), but device allocation maps to host at 0x{{0*}}[[#SMALL_ADDR]] ([[#SMALL_BYTES]] bytes) + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes) + // CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer ('present' map type modifier). + // CHECK: Libomptarget error: Call to targetDataBegin failed, abort target. + // CHECK: Libomptarget error: Failed to process data before launching the kernel. 
+ // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target data map(alloc: arr[SMALL]) + { +#pragma omp target map(present, tofrom: arr[LARGE]) + ; + } + + // CHECK-NOT: arr is present + fprintf(stderr, "arr is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_data.c b/libomptarget/test/mapping/present/target_data.c new file mode 100644 index 000000000..fd3107d87 --- /dev/null +++ b/libomptarget/test/mapping/present/target_data.c @@ -0,0 +1,42 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", &i, sizeof i); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: i) +#pragma omp target data map(present, alloc: i) + ; + + // CHECK: i is present + fprintf(stderr, "i is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target data map(present, alloc: i) + ; + + // CHECK-NOT: i is present + fprintf(stderr, "i is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_data_array_extension.c b/libomptarget/test/mapping/present/target_data_array_extension.c new file mode 100644 index 000000000..b5e43c2e1 --- /dev/null +++ b/libomptarget/test/mapping/present/target_data_array_extension.c @@ -0,0 +1,113 @@ +// -------------------------------------------------- +// Check extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: 
%libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// END. + +#include + +#define BEFORE 0 +#define AFTER 1 + +#define SIZE 100 + +#if EXTENDS == BEFORE +# define SMALL_BEG (SIZE-2) +# define SMALL_END SIZE +# define LARGE_BEG 0 +# define LARGE_END SIZE +#elif EXTENDS == AFTER +# define SMALL_BEG 0 +# define SMALL_END 2 +# define LARGE_BEG 0 +# define LARGE_END SIZE +#else +# error EXTENDS undefined +#endif + +#define SMALL_SIZE (SMALL_END-SMALL_BEG) +#define LARGE_SIZE (LARGE_END-LARGE_BEG) + +#define SMALL SMALL_BEG:SMALL_SIZE +#define LARGE LARGE_BEG:LARGE_SIZE + +int main() { + int arr[SIZE]; + + // CHECK: addr=0x[[#%x,SMALL_ADDR:]], size=[[#%u,SMALL_BYTES:]] + fprintf(stderr, "addr=%p, size=%ld\n", &arr[SMALL_BEG], + SMALL_SIZE * sizeof arr[0]); + + // CHECK: addr=0x[[#%x,LARGE_ADDR:]], size=[[#%u,LARGE_BYTES:]] + fprintf(stderr, "addr=%p, size=%ld\n", &arr[LARGE_BEG], + LARGE_SIZE * sizeof arr[0]); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: arr[LARGE]) + { +#pragma omp target data map(present, tofrom: arr[SMALL]) + ; + } + + // CHECK: arr is present + fprintf(stderr, "arr is present\n"); + + // CHECK: Libomptarget message: explicit extension not allowed: host address specified is 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes), but device allocation maps to host at 0x{{0*}}[[#SMALL_ADDR]] ([[#SMALL_BYTES]] bytes) + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes) + // CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer ('present' map type modifier). 
+ // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target data map(alloc: arr[SMALL]) + { +#pragma omp target data map(present, tofrom: arr[LARGE]) + ; + } + + // CHECK-NOT: arr is present + fprintf(stderr, "arr is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_data_at_exit.c b/libomptarget/test/mapping/present/target_data_at_exit.c new file mode 100644 index 000000000..9a258ba57 --- /dev/null +++ b/libomptarget/test/mapping/present/target_data_at_exit.c @@ -0,0 +1,37 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + +#pragma omp target enter data map(alloc:i) + + // i isn't present at the end of the target data region, but the "present" + // modifier is only checked at the beginning of a region. +#pragma omp target data map(present, alloc: i) + { +#pragma omp target exit data map(delete:i) + } + + // CHECK-NOT: Libomptarget + // CHECK: success + // CHECK-NOT: Libomptarget + fprintf(stderr, "success\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_enter_data.c b/libomptarget/test/mapping/present/target_enter_data.c new file mode 100644 index 000000000..dfe54ffbb --- /dev/null +++ b/libomptarget/test/mapping/present/target_enter_data.c @@ -0,0 +1,42 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", &i, sizeof i); + + // CHECK-NOT: Libomptarget +#pragma omp target enter data map(alloc: i) +#pragma omp target enter data map(present, alloc: i) +#pragma omp target exit data map(delete: i) + + // CHECK: i is present + fprintf(stderr, "i is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer ('present' map type modifier). 
+ // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target enter data map(present, alloc: i) + + // CHECK-NOT: i is present + fprintf(stderr, "i is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_exit_data.c b/libomptarget/test/mapping/present/target_exit_data.c new file mode 100644 index 000000000..86b7ad89c --- /dev/null +++ b/libomptarget/test/mapping/present/target_exit_data.c @@ -0,0 +1,40 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", &i, sizeof i); + + // CHECK-NOT: Libomptarget +#pragma omp target enter data map(alloc: i) +#pragma omp target exit data map(present, release: i) + + // CHECK: i is present + fprintf(stderr, "i is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target exit data map(present, release: i) + + // CHECK-NOT: i is present + fprintf(stderr, "i is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_update.c b/libomptarget/test/mapping/present/target_update.c new file mode 100644 index 000000000..3a3575a27 --- /dev/null +++ b/libomptarget/test/mapping/present/target_update.c @@ -0,0 +1,73 @@ +// -------------------------------------------------- +// Check 'to' +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'from' +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: 
%libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int i; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", &i, sizeof i); + + // CHECK-NOT: Libomptarget +#pragma omp target enter data map(alloc: i) +#pragma omp target update CLAUSE(present: i) +#pragma omp target exit data map(delete: i) + + // CHECK: i is present + fprintf(stderr, "i is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' motion modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target update CLAUSE(present: i) + + // CHECK-NOT: i is present + fprintf(stderr, "i is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/target_update_array_extension.c b/libomptarget/test/mapping/present/target_update_array_extension.c new file mode 100644 index 000000000..3e90c40c1 --- /dev/null +++ b/libomptarget/test/mapping/present/target_update_array_extension.c @@ -0,0 +1,140 @@ +// -------------------------------------------------- +// Check 'to' and extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'from' and extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + 
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'to' and extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'from' and extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// END. 
+ +#include + +#define BEFORE 0 +#define AFTER 1 + +#if EXTENDS == BEFORE +# define SMALL 2:3 +# define LARGE 0:5 +#elif EXTENDS == AFTER +# define SMALL 0:3 +# define LARGE 0:5 +#else +# error EXTENDS undefined +#endif + +int main() { + int arr[5]; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]] + fprintf(stderr, "addr=%p, size=%ld\n", arr, sizeof arr); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: arr[LARGE]) + { +#pragma omp target update CLAUSE(present: arr[SMALL]) + } + + // CHECK: arr is present + fprintf(stderr, "arr is present\n"); + + // CHECK: Libomptarget message: device mapping required by 'present' motion modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes) + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target data map(alloc: arr[SMALL]) + { +#pragma omp target update CLAUSE(present: arr[LARGE]) + } + + // CHECK-NOT: arr is present + fprintf(stderr, "arr is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/unified_shared_memory.c b/libomptarget/test/mapping/present/unified_shared_memory.c new file mode 100644 index 000000000..22d874601 --- /dev/null +++ b/libomptarget/test/mapping/present/unified_shared_memory.c @@ -0,0 +1,41 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +// The runtime considers unified shared memory to be always present. 
+#pragma omp requires unified_shared_memory
+
+int main() {
+  int i;
+
+  // CHECK-NOT: Libomptarget
+#pragma omp target data map(alloc: i)
+#pragma omp target map(present, alloc: i)
+  ;
+
+  // CHECK: i is present
+  fprintf(stderr, "i is present\n");
+
+  // CHECK-NOT: Libomptarget
+#pragma omp target map(present, alloc: i)
+  ;
+
+  // CHECK: i is present
+  fprintf(stderr, "i is present\n");
+
+  return 0;
+}
diff --git a/libomptarget/test/mapping/present/zero_length_array_section.c b/libomptarget/test/mapping/present/zero_length_array_section.c
new file mode 100644
index 000000000..3da0ab49b
--- /dev/null
+++ b/libomptarget/test/mapping/present/zero_length_array_section.c
@@ -0,0 +1,48 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51
+// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \
+// RUN: | %fcheck-aarch64-unknown-linux-gnu
+
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51
+// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \
+// RUN: | %fcheck-powerpc64-ibm-linux-gnu
+
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51
+// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \
+// RUN: | %fcheck-powerpc64le-ibm-linux-gnu
+
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51
+// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \
+// RUN: | %fcheck-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+
+int main() {
+  int arr[5];
+
+  // CHECK: addr=0x[[#%x,HOST_ADDR:]]
+  fprintf(stderr, "addr=%p\n", arr);
+
+  // CHECK-NOT: Libomptarget
+#pragma omp target data map(alloc: arr[0:5])
+#pragma omp target map(present, alloc: arr[0:0])
+  ;
+
+  // CHECK: arr is present
+  fprintf(stderr, "arr is present\n");
+
+  // arr[0:0] doesn't create an actual mapping in the first directive.
+  //
+  // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] (0 bytes)
+  // CHECK: Libomptarget error: Call to getOrAllocTgtPtr returned null pointer ('present' map type modifier).
+  // CHECK: Libomptarget error: Call to targetDataBegin failed, abort target.
+  // CHECK: Libomptarget error: Failed to process data before launching the kernel.
+ // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target data map(alloc: arr[0:0]) +#pragma omp target map(present, alloc: arr[0:0]) + ; + + // CHECK-NOT: arr is present + fprintf(stderr, "arr is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/present/zero_length_array_section_exit.c b/libomptarget/test/mapping/present/zero_length_array_section_exit.c new file mode 100644 index 000000000..bedc6a272 --- /dev/null +++ b/libomptarget/test/mapping/present/zero_length_array_section_exit.c @@ -0,0 +1,43 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -fopenmp-version=51 +// RUN: %libomptarget-run-fail-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +#include + +int main() { + int arr[5]; + + // CHECK: addr=0x[[#%x,HOST_ADDR:]] + fprintf(stderr, "addr=%p\n", arr); + + // CHECK-NOT: Libomptarget +#pragma omp target enter data map(alloc: arr[0:5]) +#pragma omp target exit data map(present, release: arr[0:0]) + + // CHECK: arr is present + fprintf(stderr, "arr is present\n"); + + // arr[0:0] doesn't create an actual mapping in the first directive. + // + // CHECK: Libomptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] (0 bytes) + // CHECK: Libomptarget fatal error 1: failure of target construct while offloading is mandatory +#pragma omp target enter data map(alloc: arr[0:0]) +#pragma omp target exit data map(present, release: arr[0:0]) + + // CHECK-NOT: arr is present + fprintf(stderr, "arr is present\n"); + + return 0; +} diff --git a/libomptarget/test/mapping/private_mapping.c b/libomptarget/test/mapping/private_mapping.c new file mode 100644 index 000000000..4578f942f --- /dev/null +++ b/libomptarget/test/mapping/private_mapping.c @@ -0,0 +1,33 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +int main() { + int data1[3] = {1}, data2[3] = {2}, data3[3] = {3}; + int sum[16] = {0}; +#pragma omp target teams distribute parallel for map(tofrom \ + : sum) \ + firstprivate(data1, data2, data3) + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 3; ++j) { + sum[i] += data1[j]; + sum[i] += data2[j]; + sum[i] += data3[j]; + } + } + + for (int i = 0; i < 16; ++i) { + assert(sum[i] == 6); + } + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS diff --git a/libomptarget/test/mapping/ptr_and_obj_motion.c b/libomptarget/test/mapping/ptr_and_obj_motion.c new file mode 100644 index 000000000..b08734542 --- /dev/null +++ b/libomptarget/test/mapping/ptr_and_obj_motion.c @@ -0,0 +1,48 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// 
RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda + +#include + +typedef struct { + double *dataptr; + int dummy1; + int dummy2; +} DV; + +void init(double vertexx[]) { + #pragma omp target map(vertexx[0:100]) + { + printf("In init: %lf, expected 100.0\n", vertexx[77]); + vertexx[77] = 77.0; + } +} + +void change(DV *dvptr) { + #pragma omp target map(dvptr->dataptr[0:100]) + { + printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]); + dvptr->dataptr[77] += 1.0; + } +} + +int main() { + double vertexx[100]; + vertexx[77] = 100.0; + + DV dv; + dv.dataptr = &vertexx[0]; + + #pragma omp target enter data map(to:vertexx[0:100]) + + init(vertexx); + change(&dv); + + #pragma omp target exit data map(from:vertexx[0:100]) + + // CHECK: Final: 78.0 + printf("Final: %lf\n", vertexx[77]); +} + diff --git a/libomptarget/test/mapping/target_data_array_extension_at_exit.c b/libomptarget/test/mapping/target_data_array_extension_at_exit.c new file mode 100644 index 000000000..89200ee0c --- /dev/null +++ b/libomptarget/test/mapping/target_data_array_extension_at_exit.c @@ -0,0 +1,140 @@ +// -------------------------------------------------- +// Check extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=BEFORE +// XUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// XUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -fopenmp-version=51 -DEXTENDS=AFTER +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// END. 
+ +#include + +#define BEFORE 0 +#define AFTER 1 + +#define SIZE 100 + +#if EXTENDS == BEFORE +# define SMALL_BEG (SIZE-2) +# define SMALL_END SIZE +# define LARGE_BEG 0 +# define LARGE_END SIZE +#elif EXTENDS == AFTER +# define SMALL_BEG 0 +# define SMALL_END 2 +# define LARGE_BEG 0 +# define LARGE_END SIZE +#else +# error EXTENDS undefined +#endif + +#define SMALL SMALL_BEG:(SMALL_END-SMALL_BEG) +#define LARGE LARGE_BEG:(LARGE_END-LARGE_BEG) + +void check_not_present() { + int arr[SIZE]; + + for (int i = 0; i < SIZE; ++i) + arr[i] = 99; + + // CHECK-LABEL: checking not present + fprintf(stderr, "checking not present\n"); + + // arr[LARGE] isn't (fully) present at the end of the target data region, so + // the device-to-host transfer should not be performed, or it might fail. +#pragma omp target data map(tofrom: arr[LARGE]) + { +#pragma omp target exit data map(delete: arr[LARGE]) +#pragma omp target enter data map(alloc: arr[SMALL]) +#pragma omp target map(alloc: arr[SMALL]) + for (int i = SMALL_BEG; i < SMALL_END; ++i) + arr[i] = 88; + } + + // CHECK-NOT: Libomptarget + // CHECK-NOT: error + for (int i = 0; i < SIZE; ++i) { + if (arr[i] != 99) + fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]); + } +} + +void check_is_present() { + int arr[SIZE]; + + for (int i = 0; i < SIZE; ++i) + arr[i] = 99; + + // CHECK-LABEL: checking is present + fprintf(stderr, "checking is present\n"); + + // arr[SMALL] is (fully) present at the end of the target data region, and the + // device-to-host transfer should be performed only for it even though more + // of the array is then present. +#pragma omp target data map(tofrom: arr[SMALL]) + { +#pragma omp target exit data map(delete: arr[SMALL]) +#pragma omp target enter data map(alloc: arr[LARGE]) +#pragma omp target map(alloc: arr[LARGE]) + for (int i = LARGE_BEG; i < LARGE_END; ++i) + arr[i] = 88; + } + + // CHECK-NOT: Libomptarget + // CHECK-NOT: error + for (int i = 0; i < SIZE; ++i) { + if (SMALL_BEG <= i && i < SMALL_END) { + if (arr[i] != 88) + fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]); + } else if (arr[i] != 99) { + fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]); + } + } +} + +int main() { + check_not_present(); + check_is_present(); + return 0; +} diff --git a/libomptarget/test/mapping/target_implicit_partial_map.c b/libomptarget/test/mapping/target_implicit_partial_map.c new file mode 100644 index 000000000..aa713ff0f --- /dev/null +++ b/libomptarget/test/mapping/target_implicit_partial_map.c @@ -0,0 +1,39 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu +// +// END. 
+ +#include +#include + +int main() { + int arr[100]; + +#pragma omp target data map(alloc: arr[50:2]) // partially mapped + { +#pragma omp target // would implicitly map with full size but already present + { + arr[50] = 5; + arr[51] = 6; + } // must treat as present (dec ref count) even though full size not present + } // wouldn't delete if previous ref count dec didn't happen + + // CHECK: still present: 0 + fprintf(stderr, "still present: %d\n", + omp_target_is_present(&arr[50], omp_get_default_device())); + + return 0; +} diff --git a/libomptarget/test/mapping/target_update_array_extension.c b/libomptarget/test/mapping/target_update_array_extension.c new file mode 100644 index 000000000..f5748ce90 --- /dev/null +++ b/libomptarget/test/mapping/target_update_array_extension.c @@ -0,0 +1,136 @@ +// -------------------------------------------------- +// Check 'to' and extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=BEFORE +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'from' and extends before +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=BEFORE +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'to' and extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -DCLAUSE=to -DEXTENDS=AFTER +// RUN: 
%libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// -------------------------------------------------- +// Check 'from' and extends after +// -------------------------------------------------- + +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 \ +// RUN: | %fcheck-aarch64-unknown-linux-gnu + +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64-ibm-linux-gnu + +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 \ +// RUN: | %fcheck-powerpc64le-ibm-linux-gnu + +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu \ +// RUN: -DCLAUSE=from -DEXTENDS=AFTER +// RUN: %libomptarget-run-x86_64-pc-linux-gnu 2>&1 \ +// RUN: | %fcheck-x86_64-pc-linux-gnu + +// END. + +#include + +#define BEFORE 0 +#define AFTER 1 + +#if EXTENDS == BEFORE +# define SMALL 2:3 +# define LARGE 0:5 +#elif EXTENDS == AFTER +# define SMALL 0:3 +# define LARGE 0:5 +#else +# error EXTENDS undefined +#endif + +int main() { + int arr[5]; + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: arr[LARGE]) + { +#pragma omp target update CLAUSE(arr[SMALL]) + } + + // CHECK: success + fprintf(stderr, "success\n"); + + // CHECK-NOT: Libomptarget +#pragma omp target data map(alloc: arr[SMALL]) + { +#pragma omp target update CLAUSE(arr[LARGE]) + } + + // CHECK: success + fprintf(stderr, "success\n"); + + return 0; +} diff --git a/libomptarget/test/offloading/bug47654.cpp b/libomptarget/test/offloading/bug47654.cpp new file mode 100644 index 000000000..14c8a9dde --- /dev/null +++ b/libomptarget/test/offloading/bug47654.cpp @@ -0,0 +1,29 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include +#include + +int main(int argc, char *argv[]) { + int i = 0, j = 0; + +#pragma omp target map(tofrom : i, j) nowait + { + i = 1; + j = 2; + } + +#pragma omp taskwait + + assert(i == 1); + assert(j == 2); + + std::cout << "PASS\n"; + + return 0; +} + +// CHECK: PASS diff --git a/libomptarget/test/offloading/d2d_memcpy.c b/libomptarget/test/offloading/d2d_memcpy.c new file mode 100644 index 000000000..968f7112c --- /dev/null +++ b/libomptarget/test/offloading/d2d_memcpy.c @@ -0,0 +1,68 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-nvptx64-nvidia-cuda | 
%fcheck-nvptx64-nvidia-cuda -allow-empty + +#include +#include +#include +#include + +const int magic_num = 7; + +int main(int argc, char *argv[]) { + const int N = 128; + const int num_devices = omp_get_num_devices(); + + // No target device, just return + if (num_devices == 0) { + printf("PASS\n"); + return 0; + } + + const int src_device = 0; + int dst_device = num_devices - 1; + + int length = N * sizeof(int); + int *src_ptr = omp_target_alloc(length, src_device); + int *dst_ptr = omp_target_alloc(length, dst_device); + + assert(src_ptr && "src_ptr is NULL"); + assert(dst_ptr && "dst_ptr is NULL"); + +#pragma omp target teams distribute parallel for device(src_device) \ + is_device_ptr(src_ptr) + for (int i = 0; i < N; ++i) { + src_ptr[i] = magic_num; + } + + int rc = + omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device); + + assert(rc == 0 && "error in omp_target_memcpy"); + + int *buffer = malloc(length); + + assert(buffer && "failed to allocate host buffer"); + +#pragma omp target teams distribute parallel for device(dst_device) \ + map(from: buffer[0:N]) is_device_ptr(dst_ptr) + for (int i = 0; i < N; ++i) { + buffer[i] = dst_ptr[i] + magic_num; + } + + for (int i = 0; i < N; ++i) + assert(buffer[i] == 2 * magic_num); + + printf("PASS\n"); + + // Free host and device memory + free(buffer); + omp_target_free(src_ptr, src_device); + omp_target_free(dst_ptr, dst_device); + + return 0; +} + +// CHECK: PASS diff --git a/libomptarget/test/offloading/dynamic_module.c b/libomptarget/test/offloading/dynamic_module.c new file mode 100644 index 000000000..ba3c025ba --- /dev/null +++ b/libomptarget/test/offloading/dynamic_module.c @@ -0,0 +1,18 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-aarch64-unknown-linux-gnu %t.so && %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DSHARED -fPIC -shared -o %t.so && %libomptarget-compile-nvptx64-nvidia-cuda %t.so && %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda + +#ifdef SHARED +void foo() {} +#else +#include +int main() { +#pragma omp target + ; + // CHECK: DONE. 
+ printf("%s\n", "DONE."); + return 0; +} +#endif diff --git a/libomptarget/test/offloading/dynamic_module_load.c b/libomptarget/test/offloading/dynamic_module_load.c new file mode 100644 index 000000000..b58d07f3e --- /dev/null +++ b/libomptarget/test/offloading/dynamic_module_load.c @@ -0,0 +1,38 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-aarch64-unknown-linux-gnu -ldl && %libomptarget-run-aarch64-unknown-linux-gnu %t.so 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t-nvptx64-nvidia-cuda -ldl && %libomptarget-run-nvptx64-nvidia-cuda %t.so 2>&1 | %fcheck-nvptx64-nvidia-cuda + +// [BOLT] It takes too long time. Let's remove this from tests. +// UNSUPPORTED: clang-11 + +#ifdef SHARED +#include +int foo() { +#pragma omp target + ; + printf("%s\n", "DONE."); + return 0; +} +#else +#include +#include +int main(int argc, char **argv) { + void *Handle = dlopen(argv[1], RTLD_NOW); + int (*Foo)(void); + + if (Handle == NULL) { + printf("dlopen() failed: %s\n", dlerror()); + return 1; + } + Foo = (int (*)(void)) dlsym(Handle, "foo"); + if (Handle == NULL) { + printf("dlsym() failed: %s\n", dlerror()); + return 1; + } + // CHECK: DONE. 
+ // CHECK-NOT: {{abort|fault}} + return Foo(); +} +#endif diff --git a/libomptarget/test/offloading/info.c b/libomptarget/test/offloading/info.c new file mode 100644 index 000000000..e04f9ccaa --- /dev/null +++ b/libomptarget/test/offloading/info.c @@ -0,0 +1,38 @@ +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -gline-tables-only && env LIBOMPTARGET_INFO=23 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=INFO + +#include +#include + +#define N 64 + +int main() { + int A[N]; + int B[N]; + int C[N]; + int val = 1; + +// INFO: CUDA device 0 info: Device supports up to {{.*}} CUDA blocks and {{.*}} threads with a warp size of {{.*}} +// INFO: Libomptarget device 0 info: Entering OpenMP data region at info.c:33:1 with 3 arguments: +// INFO: Libomptarget device 0 info: alloc(A[0:64])[256] +// INFO: Libomptarget device 0 info: tofrom(B[0:64])[256] +// INFO: Libomptarget device 0 info: to(C[0:64])[256] +// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:33:1: +// INFO: Libomptarget device 0 info: Host Ptr Target Ptr Size (B) RefCount Declaration +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 C[0:64] at info.c:11:7 +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 B[0:64] at info.c:10:7 +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 A[0:64] at info.c:9:7 +// INFO: Libomptarget device 0 info: Entering OpenMP kernel at info.c:34:1 with 1 arguments: +// INFO: Libomptarget device 0 info: firstprivate(val)[4] +// INFO: CUDA device 0 info: Launching kernel {{.*}} with {{.*}} and {{.*}} threads in {{.*}} mode +// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:34:1: +// INFO: Libomptarget device 0 info: Host Ptr Target Ptr Size (B) RefCount Declaration +// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 C[0:64] at info.c:11:7 +// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 B[0:64] at info.c:10:7 +// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 A[0:64] at info.c:9:7 +// INFO: Libomptarget device 0 info: Exiting OpenMP data region at info.c:33:1 +#pragma omp target data map(alloc:A[0:N]) map(tofrom:B[0:N]) map(to:C[0:N]) +#pragma omp target firstprivate(val) + { val = 1; } + + return 0; +} diff --git a/libomptarget/test/offloading/looptripcnt.c b/libomptarget/test/offloading/looptripcnt.c index 025231b0c..d4c3d6013 100644 --- a/libomptarget/test/offloading/looptripcnt.c +++ b/libomptarget/test/offloading/looptripcnt.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG // REQUIRES: libomptarget-debug /* diff --git a/libomptarget/test/offloading/memory_manager.cpp b/libomptarget/test/offloading/memory_manager.cpp new file mode 100644 index 000000000..1e9d5e4da --- /dev/null +++ 
b/libomptarget/test/offloading/memory_manager.cpp @@ -0,0 +1,47 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda + +#include + +#include +#include + +int main(int argc, char *argv[]) { +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + void *p = omp_target_alloc(n * sizeof(int), 0); + omp_target_free(p, 0); + } + } + +#pragma omp parallel for + for (int i = 0; i < 16; ++i) { + for (int n = 1; n < (1 << 13); n <<= 1) { + int *p = (int *)omp_target_alloc(n * sizeof(int), 0); +#pragma omp target teams distribute parallel for is_device_ptr(p) + for (int j = 0; j < n; ++j) { + p[j] = i; + } + int buffer[n]; +#pragma omp target teams distribute parallel for is_device_ptr(p) \ + map(from \ + : buffer) + for (int j = 0; j < n; ++j) { + buffer[j] = p[j]; + } + for (int j = 0; j < n; ++j) { + assert(buffer[j] == i); + } + omp_target_free(p, 0); + } + } + + std::cout << "PASS\n"; + return 0; +} + +// CHECK: PASS diff --git a/libomptarget/test/offloading/non_contiguous_update.cpp b/libomptarget/test/offloading/non_contiguous_update.cpp new file mode 100644 index 000000000..e2f1c4569 --- /dev/null +++ b/libomptarget/test/offloading/non_contiguous_update.cpp @@ -0,0 +1,101 @@ +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +#include +#include +#include + +// Data structure definitions copied from OpenMP RTL. +struct __tgt_target_non_contig { + int64_t offset; + int64_t width; + int64_t stride; +}; + +enum tgt_map_type { + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 +}; + +// OpenMP RTL interfaces +#ifdef __cplusplus +extern "C" { +#endif +void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +#ifdef __cplusplus +} +#endif + +int main() { + // case 1 + // int arr[3][4][5][6]; + // #pragma omp target update to(arr[0:2][1:3][1:2][:]) + // set up descriptor + __tgt_target_non_contig non_contig[5] = { + {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}}; + int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG; + + void *base; + void *begin = &non_contig; + int64_t *sizes = &size; + int64_t *types = &type; + + // The below diagram is the visualization of the non-contiguous transfer after + // optimization. 
Note that each element represents the innermost dimension
+  // (unit size = 24) since the stride * count of the last dimension is equal
+  // to the stride of the second-last dimension.
+  //
+  // OOOOO OOOOO OOOOO
+  // OXXOO OXXOO OOOOO
+  // OXXOO OXXOO OOOOO
+  // OXXOO OXXOO OOOOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 144
+  // DEBUG: offset 264
+  // DEBUG: offset 384
+  // DEBUG: offset 624
+  // DEBUG: offset 744
+  // DEBUG: offset 864
+
+
+  // case 2
+  // double darr[3][4][5];
+  // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
+  // set up descriptor
+  __tgt_target_non_contig non_contig_2[4] = {
+      {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}};
+  int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG;
+
+  void *base_2;
+  void *begin_2 = &non_contig_2;
+  int64_t *sizes_2 = &size_2;
+  int64_t *types_2 = &type_2;
+
+  // The below diagram is the visualization of the non-contiguous transfer
+  // after optimization. Note that each element represents the innermost
+  // dimension (unit size = 24) since the stride * count of the last dimension
+  // is equal to the stride of the second-last dimension.
+  //
+  // OOOOO OOOOO OOOOO
+  // OOOOO OOOOO OOOOO
+  // XOXOO OOOOO XOXOO
+  // XOXOO OOOOO XOXOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2,
+                           sizes_2, types_2);
+  // DEBUG: offset 80
+  // DEBUG: offset 96
+  // DEBUG: offset 120
+  // DEBUG: offset 136
+  // DEBUG: offset 400
+  // DEBUG: offset 416
+  // DEBUG: offset 440
+  // DEBUG: offset 456
+  return 0;
+}
+
diff --git a/libomptarget/test/offloading/offloading_success.c b/libomptarget/test/offloading/offloading_success.c
index 12e78fac1..000a38cb5 100644
--- a/libomptarget/test/offloading/offloading_success.c
+++ b/libomptarget/test/offloading/offloading_success.c
@@ -2,6 +2,7 @@
 // RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
 // RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
 // RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda

 #include <stdio.h>
 #include <omp.h>
diff --git a/libomptarget/test/offloading/offloading_success.cpp b/libomptarget/test/offloading/offloading_success.cpp
index eecd97a3f..910cb1790 100644
--- a/libomptarget/test/offloading/offloading_success.cpp
+++ b/libomptarget/test/offloading/offloading_success.cpp
@@ -2,6 +2,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
 // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
 // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda

 #include <stdio.h>
 #include <omp.h>
diff --git a/libomptarget/test/offloading/parallel_offloading_map.cpp b/libomptarget/test/offloading/parallel_offloading_map.cpp
new file mode 100644
index 000000000..08c4470c1
--- /dev/null
+++ b/libomptarget/test/offloading/parallel_offloading_map.cpp
@@ -0,0 +1,43 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
+
+#include <cassert>
+#include <iostream>
+
+int main(int argc, char *argv[]) {
+  constexpr const int num_threads = 8, N = 16;
+  int array[num_threads] = {0};
+
+#pragma omp parallel for
+  for (int i = 0; i < num_threads; ++i) {
+    int tmp[N];
+
+ for (int j = 0; j < N; ++j) { + tmp[j] = i; + } + +#pragma omp target teams distribute parallel for map(tofrom : tmp) + for (int j = 0; j < N; ++j) { + tmp[j] += j; + } + + for (int j = 0; j < N; ++j) { + array[i] += tmp[j]; + } + } + + // Verify + for (int i = 0; i < num_threads; ++i) { + const int ref = (0 + N - 1) * N / 2 + i * N; + assert(array[i] == ref); + } + + std::cout << "PASS\n"; + + return 0; +} + +// CHECK: PASS diff --git a/libomptarget/test/offloading/requires.c b/libomptarget/test/offloading/requires.c index 079ce5cb9..60346b103 100644 --- a/libomptarget/test/offloading/requires.c +++ b/libomptarget/test/offloading/requires.c @@ -2,6 +2,7 @@ // RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG // RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG // REQUIRES: libomptarget-debug /* diff --git a/libomptarget/test/offloading/target_depend_nowait.cpp b/libomptarget/test/offloading/target_depend_nowait.cpp index 2c1c7e719..32a85d972 100644 --- a/libomptarget/test/offloading/target_depend_nowait.cpp +++ b/libomptarget/test/offloading/target_depend_nowait.cpp @@ -2,6 +2,7 @@ // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda #include #include diff --git a/libomptarget/test/unified_shared_memory/api.c b/libomptarget/test/unified_shared_memory/api.c index b0a71ad35..a1c94c88d 100644 --- a/libomptarget/test/unified_shared_memory/api.c +++ b/libomptarget/test/unified_shared_memory/api.c @@ -33,8 +33,10 @@ int main(int argc, char *argv[]) { // that do not support requires. __tgt_register_requires(8); - // CHECK: Initial device: -10 + // CHECK: Initial device: [[INITIAL_DEVICE:[0-9]+]] printf("Initial device: %d\n", omp_get_initial_device()); + // CHECK: Num devices: [[INITIAL_DEVICE]] + printf("Num devices: %d\n", omp_get_num_devices()); // // Target alloc & target memcpy diff --git a/maint/bolt-release.pl b/maint/bolt-release.pl new file mode 100755 index 000000000..8f04f7508 --- /dev/null +++ b/maint/bolt-release.pl @@ -0,0 +1,155 @@ +#!/usr/bin/env perl +# +# (C) 2016 by Argonne National Laboratory. +# See LICENSE.txt in top-level directory. 
+# + +use strict; +use warnings; + +use Cwd qw( cwd getcwd realpath ); +use Getopt::Long; +use File::Temp qw( tempdir ); + +my $arg = 0; +my $branch = ""; +my $version = ""; +my $append_commit_id; +my $root = cwd(); +my $git_repo = ""; + +my $logfile = "release.log"; + +sub usage +{ + print "Usage: $0 [OPTIONS]\n\n"; + print "OPTIONS:\n"; + + print "\t--git-repo path to root of the git repository (required)\n"; + print "\t--branch git branch to be packaged (required)\n"; + print "\t--version tarball version (required)\n"; + print "\t--append-commit-id append git commit description (optional)\n"; + + print "\n"; + + exit 1; +} + +sub check_package +{ + my $pack = shift; + + print "===> Checking for package $pack... "; + if (`which $pack` eq "") { + print "not found\n"; + exit; + } + print "done\n"; +} + +# will also chdir to the top level of the git repository +sub check_git_repo { + my $repo_path = shift; + + print "===> chdir to $repo_path\n"; + chdir $repo_path; + + print "===> Checking git repository sanity... "; + unless (`git rev-parse --is-inside-work-tree 2> /dev/null` eq "true\n") { + print "ERROR: $repo_path is not a git repository\n"; + exit 1; + } + # I'm not strictly sure that this is true, but it's not too burdensome right + # now to restrict it to complete (non-bare repositories). + unless (`git rev-parse --is-bare-repository 2> /dev/null` eq "false\n") { + print "ERROR: $repo_path is a *bare* repository (need working tree)\n"; + exit 1; + } + + print "done\n"; +} + +sub run_cmd +{ + my $cmd = shift; + + #print("===> running cmd=|$cmd| from ".getcwd()."\n"); + system("$cmd >> $root/$logfile 2>&1"); + if ($?) { + die "unable to execute ($cmd), \$?=$?. Stopped"; + } +} + +GetOptions( + "git-repo=s" => \$git_repo, + "branch=s" => \$branch, + "version=s" => \$version, + "append-commit-id!" => \$append_commit_id, + "help" => \&usage +) or die "unable to parse options, stopped"; + +if (scalar(@ARGV) != 0) { + usage(); +} + +if (!$branch || !$version || !$git_repo) { + usage(); +} + +check_package("git"); +check_package("cmake"); +print("\n"); + + +my $tdir = tempdir(CLEANUP => 1); +my $local_git_clone = "${tdir}/bolt-clone"; + + +# clone git repo +print("===> Cloning git repo... "); +run_cmd("git clone --recursive -b ${branch} ${git_repo} ${local_git_clone}"); +print("done\n"); + +# chdirs to $local_git_clone if valid +check_git_repo($local_git_clone); +print("\n"); + +if ($append_commit_id) { + my $desc = `git describe --always ${branch}`; + chomp $desc; + $version .= "-${desc}"; +} + +my $expdir = "${tdir}/bolt-${version}"; + +# Clean up the log file +system("rm -f ${root}/$logfile"); + +# Check out the appropriate branch +print("===> Exporting code from git... "); +run_cmd("rm -rf ${expdir}"); +run_cmd("mkdir -p ${expdir}"); +run_cmd("git archive ${branch} --prefix='bolt-${version}/' | tar -x -C $tdir"); +run_cmd("git submodule foreach --recursive \'git archive HEAD --prefix='' | tar -x -C `echo \${toplevel}/\${path} | sed -e s/clone/${version}/`'"); +print("done\n"); + +# Remove unnecessary files +print("===> Removing unnecessary files in the main codebase... "); +chdir($expdir); +run_cmd("find . -name .gitignore | xargs rm -rf"); +run_cmd("find . -name .gitmodules | xargs rm -rf"); +run_cmd("find . -name .tmp | xargs rm -rf"); +print("done\n"); + +# TODO: Get docs + +# Create the main tarball +print("===> Creating the final bolt tarball... 
"); +chdir("${tdir}"); +run_cmd("tar -czvf bolt-${version}.tar.gz bolt-${version}"); +run_cmd("cp -a bolt-${version}.tar.gz ${root}/"); +print("done\n"); + +# make sure we are outside of the tempdir so that the CLEANUP logic can run +chdir("${tdir}/.."); + diff --git a/maint/update-llvmomp.sh b/maint/update-llvmomp.sh new file mode 100755 index 000000000..3e0a46baf --- /dev/null +++ b/maint/update-llvmomp.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +boltpath="$(pwd)" + +# Switch to the llvmomp branch +git checkout llvmomp + +# Get the latest llvm-project +if [ -d "${boltpath}/llvm-project" ]; then + cd "${boltpath}/llvm-project" + git pull + git checkout master +else + git clone https://github.com/llvm/llvm-project.git "${boltpath}/llvm-project" +fi + +# Find the corresponding latest commit of the llvmomp branch +cd "$boltpath" +if [ x"$(git log HEAD~..HEAD | grep cherry-pick:)" = x ]; then + # The last commit in llvm-mirror: d69d1aa131b4cf339bfac116e50da33a5f94b861 + commit_id_begin="d69d1aa131b4cf339bfac116e50da33a5f94b861" +else + # Extract the first "cherry-pick:" comment from the latest log. + commit_id_begin=$(git log HEAD~..HEAD | grep -Po "cherry-pick: \K[\w]+" | head -n 1) +fi +echo "last commit: $commit_id_begin" + +rm -rf "${boltpath}/llvm-project/patch_diff.tmp" +rm -rf "${boltpath}/llvm-project/patch_log.tmp" +rm -rf "${boltpath}/llvm-project/patch_changed_list.tmp" + +cd "${boltpath}/llvm-project/openmp" + +for commit_id in $(git rev-list --reverse ${commit_id_begin}..HEAD); do + echo "checking $commit_id" + # Check if OpenMP part is changed. + if [ x"$(git diff --name-only --relative ${commit_id}~..${commit_id})" != x ]; then + echo "\n#########################\n" + # Create a diff file + git diff --relative ${commit_id}~..${commit_id} > "${boltpath}/llvm-project/patch_diff.tmp" + # Create a list of changed files + git diff --name-only --relative ${commit_id}~..${commit_id} > "${boltpath}/llvm-project/patch_changed_list.tmp" + # Create a log file for this + git log ${commit_id}~..${commit_id} --pretty=format:"%B" > "${boltpath}/llvm-project/patch_log.tmp" + echo "" >> "${boltpath}/llvm-project/patch_log.tmp" + echo "cherry-pick: $commit_id" >> "${boltpath}/llvm-project/patch_log.tmp" + echo "https://github.com/llvm/llvm-project/commit/${commit_id}" >> "${boltpath}/llvm-project/patch_log.tmp" + author_info="$(git show --pretty="%aN <%aE>" $commit_id | head -n 1)" + timestamp_info="$(git show --pretty="%cd" $commit_id | head -n 1)" + + # Go to the BOLT repository + cd "${boltpath}" + + # Apply the diff to the llvmomp branch + git apply "${boltpath}/llvm-project/patch_diff.tmp" + for changed_file in $(cat "${boltpath}/llvm-project/patch_changed_list.tmp"); do + git add $changed_file + done + + # Commit it. + cat "${boltpath}/llvm-project/patch_log.tmp" + git commit -F "${boltpath}/llvm-project/patch_log.tmp" --author="$author_info" --date="$timestamp_info" + + # Go to the llvm-project repository. 
+ cd "${boltpath}/llvm-project/openmp" + fi +done + +echo "complete" diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 14490c62f..a7e88d611 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -16,8 +16,8 @@ endif() set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) # Set libomp version -set(LIBOMP_VERSION_MAJOR 5) -set(LIBOMP_VERSION_MINOR 0) +set(LIBBOLT_VERSION_MAJOR 5) +set(LIBBOLT_VERSION_MINOR 0) # These include files are in the cmake/ subdirectory include(LibompUtils) @@ -34,6 +34,7 @@ if(${OPENMP_STANDALONE_BUILD}) # Should assertions be enabled? They are on by default. set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL "enable assertions?") + set(LIBOMPTARGET_PROFILING_SUPPORT FALSE) else() # Part of LLVM build # Determine the native architecture from LLVM. string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH) @@ -65,14 +66,27 @@ else() # Part of LLVM build libomp_get_architecture(LIBOMP_ARCH) endif () set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS}) + # Time profiling support + set(LIBOMPTARGET_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMPTARGET_PROFILING}) endif() -libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 mic mips mips64 riscv64) + +# FUJITSU A64FX is a special processor because its cache line size is 256. +# We need to pass this information into kmp_config.h. +if(LIBOMP_ARCH STREQUAL "aarch64") + libomp_is_aarch64_a64fx(LIBOMP_DETECT_AARCH64_A64FX) + if (LIBOMP_DETECT_AARCH64_A64FX) + set(LIBOMP_ARCH "aarch64_a64fx") + set(LIBOMP_ARCH_AARCH64_A64FX TRUE) + endif() +endif() + +libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64) set(LIBOMP_LIB_TYPE normal CACHE STRING "Performance,Profiling,Stubs library (normal/profile/stubs)") libomp_check_variable(LIBOMP_LIB_TYPE normal profile stubs) -# Set the OpenMP Year and Month assiociated with version -set(LIBOMP_OMP_YEAR_MONTH 201611) +# Set the OpenMP Year and Month associated with version +set(LIBBOLT_OMP_YEAR_MONTH 201611) set(LIBOMP_MIC_ARCH knc CACHE STRING "Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) (knf/knc). Ignored if not Intel(R) MIC Architecture build.") if("${LIBOMP_ARCH}" STREQUAL "mic") @@ -114,7 +128,7 @@ set(LIBOMP_FFLAGS "" CACHE STRING # Turning this to FALSE aids parallel builds to not interfere with each other. # Currently, the testsuite module expects the just built OpenMP library to be located inside the exports/ # directory. TODO: have testsuite run under llvm-lit directly. 
We can then get rid of copying to exports/ -set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING +set(LIBBOLT_COPY_EXPORTS FALSE CACHE STRING "Should exports be copied into source exports/ directory?") # HWLOC-support @@ -123,10 +137,23 @@ set(LIBOMP_USE_HWLOC FALSE CACHE BOOL set(LIBOMP_HWLOC_INSTALL_DIR /usr/local CACHE PATH "Install path for hwloc library") +# Argobots-support +set(LIBOMP_USE_ARGOBOTS FALSE CACHE BOOL + "Use Argobots (http://www.argobots.org) as threading model?") +set(LIBOMP_REMOVE_FORKJOIN_LOCK FALSE CACHE BOOL + "Remove FORK_JOIN_LOCK (experimental)") +if(LIBOMP_USE_ARGOBOTS) + set(KMP_USE_ABT 1) + set(KMP_ABT_USE_SELF_INFO 1) + set(BOLT_THREAD_TYPE BOLT_THREAD_ARGOBOTS) +else() + set(BOLT_THREAD_TYPE BOLT_THREAD_NATIVE) +endif() + # Get the build number from kmp_version.cpp -libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_VERSION_BUILD) -math(EXPR LIBOMP_VERSION_BUILD_YEAR "${LIBOMP_VERSION_BUILD}/10000") -math(EXPR LIBOMP_VERSION_BUILD_MONTH_DAY "${LIBOMP_VERSION_BUILD}%10000") +libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBBOLT_VERSION_BUILD) +math(EXPR LIBBOLT_VERSION_BUILD_YEAR "${LIBBOLT_VERSION_BUILD}/10000") +math(EXPR LIBBOLT_VERSION_BUILD_MONTH_DAY "${LIBBOLT_VERSION_BUILD}%10000") # Currently don't record any timestamps set(LIBOMP_BUILD_DATE "No_Timestamp") @@ -136,6 +163,7 @@ set(IA32 FALSE) set(INTEL64 FALSE) set(ARM FALSE) set(AARCH64 FALSE) +set(AARCH64_A64FX FALSE) set(PPC64BE FALSE) set(PPC64LE FALSE) set(PPC64 FALSE) @@ -157,6 +185,8 @@ elseif("${LIBOMP_ARCH}" STREQUAL "ppc64le") # PPC64LE architecture set(PPC64 TRUE) elseif("${LIBOMP_ARCH}" STREQUAL "aarch64") # AARCH64 architecture set(AARCH64 TRUE) +elseif("${LIBOMP_ARCH}" STREQUAL "aarch64_a64fx") # AARCH64_A64FX architecture + set(AARCH64_A64FX TRUE) elseif("${LIBOMP_ARCH}" STREQUAL "mic") # Intel(R) Many Integrated Core Architecture set(MIC TRUE) elseif("${LIBOMP_ARCH}" STREQUAL "mips") # MIPS architecture @@ -187,6 +217,11 @@ endif() set(LIBOMP_USE_ITT_NOTIFY TRUE CACHE BOOL "Enable ITT notify?") +if(LIBOMP_USE_ARGOBOTS) + # BOLT does not support it. + set(LIBOMP_USE_ITT_NOTIFY FALSE) +endif() + # normal, profile, stubs library. set(NORMAL_LIBRARY FALSE) set(STUBS_LIBRARY FALSE) @@ -216,8 +251,8 @@ if(WIN32) endif() # Getting legal type/arch -libomp_get_legal_type(LIBOMP_LEGAL_TYPE) -libomp_get_legal_arch(LIBOMP_LEGAL_ARCH) +libomp_get_legal_type(LIBBOLT_LEGAL_TYPE) +libomp_get_legal_arch(LIBBOLT_LEGAL_ARCH) # Compiler flag checks, library checks, threading check, etc. 
include(config-ix) @@ -294,14 +329,14 @@ set(OMPT_DEFAULT FALSE) if ((LIBOMP_HAVE_OMPT_SUPPORT) AND (NOT WIN32)) set(OMPT_DEFAULT TRUE) endif() -set(LIBOMP_OMPT_SUPPORT ${OMPT_DEFAULT} CACHE BOOL +set(LIBBOLT_OMPT_SUPPORT ${OMPT_DEFAULT} CACHE BOOL "OMPT-support?") set(LIBOMP_OMPT_DEBUG FALSE CACHE BOOL "Trace OMPT initialization?") set(LIBOMP_OMPT_OPTIONAL TRUE CACHE BOOL "OMPT-optional?") -if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT)) +if(LIBBOLT_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT)) libomp_error_say("OpenMP Tools Interface requested but not available in this implementation") endif() @@ -317,19 +352,34 @@ if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC)) libomp_error_say("Hwloc requested but not available") endif() +# Error check Argobots support after config-ix has run +if(LIBOMP_USE_ARGOBOTS AND (NOT LIBOMP_HAVE_ARGOBOTS)) + libomp_error_say("Argobots requested but not available") +endif() + # Hierarchical scheduling support set(LIBOMP_USE_HIER_SCHED FALSE CACHE BOOL "Hierarchical scheduling support?") # Setting final library name set(LIBOMP_DEFAULT_LIB_NAME libomp) +set(LIBBOLT_DEFAULT_LIB_NAME libbolt) if(${PROFILE_LIBRARY}) - set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}prof) + set(LIBBOLT_DEFAULT_LIB_NAME ${LIBBOLT_DEFAULT_LIB_NAME}prof) endif() if(${STUBS_LIBRARY}) - set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}stubs) + set(LIBBOLT_DEFAULT_LIB_NAME ${LIBBOLT_DEFAULT_LIB_NAME}stubs) +endif() +set(LIBBOLT_LIB_NAME ${LIBBOLT_DEFAULT_LIB_NAME} CACHE STRING "Base BOLT library name") +if (NOT DEFINED LIBOMP_LIB_NAME) + set(LIBOMP_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME} CACHE STRING "Base OpenMP library name") +endif() +if (OPENMP_STANDALONE_BUILD + AND NOT (${LIBOMP_LIB_NAME} STREQUAL ${LIBOMP_DEFAULT_LIB_NAME}) + AND (${LIBBOLT_LIB_NAME} STREQUAL ${LIBBOLT_DEFAULT_LIB_NAME})) + # Use LIBOMP_LIB_NAME. + set(LIBBOLT_LIB_NAME, ${LIBOMP_LIB_NAME}) endif() -set(LIBOMP_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME} CACHE STRING "Base OMP library name") if(${LIBOMP_ENABLE_SHARED}) set(LIBOMP_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) @@ -341,7 +391,27 @@ else() set(LIBOMP_INSTALL_KIND ARCHIVE) endif() -set(LIBOMP_LIB_FILE ${LIBOMP_LIB_NAME}${LIBOMP_LIBRARY_SUFFIX}) +set(LIBOMP_LIB_FILE ${LIBBOLT_LIB_NAME}${LIBOMP_LIBRARY_SUFFIX}) + +# Set BOLT version, release date, and build information +set(BOLT_VERSION "1.0") +set(BOLT_NUMVERSION "10000300") +set(BOLT_RELEASE_DATE "Tue May 5 6:00:00 UTC 2020") +set(BOLT_COMPILER_CC ${CMAKE_C_COMPILER}) +set(BOLT_COMPILER_CXX ${CMAKE_CXX_COMPILER}) +if("${libomp_build_type_lowercase}" STREQUAL "release") + set(BOLT_COMPILER_CFLAGS ${CMAKE_C_FLAGS_RELEASE}) + set(BOLT_COMPILER_CXXFLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif("${libomp_build_type_lowercase}" STREQUAL "debug") + set(BOLT_COMPILER_CFLAGS ${CMAKE_C_FLAGS_DEBUG}) + set(BOLT_COMPILER_CXXFLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif("${libomp_build_type_lowercase}" STREQUAL "relwithdebinfo") + set(BOLT_COMPILER_CFLAGS ${CMAKE_C_FLAGS_RELWITHDEBINFO}) + set(BOLT_COMPILER_CXXFLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif("${libomp_build_type_lowercase}" STREQUAL "minsizerel") + set(BOLT_COMPILER_CFLAGS ${CMAKE_C_FLAGS_RELEASE}) + set(BOLT_COMPILER_CXXFLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +endif() # Optional backwards compatibility aliases. 
set(LIBOMP_INSTALL_ALIASES TRUE CACHE BOOL @@ -359,24 +429,31 @@ if(${OPENMP_STANDALONE_BUILD}) libomp_say("Library Type -- ${LIBOMP_LIB_TYPE}") libomp_say("Fortran Modules -- ${LIBOMP_FORTRAN_MODULES}") # will say development if all zeros - if(${LIBOMP_VERSION_BUILD} STREQUAL 00000000) + if(${LIBBOLT_VERSION_BUILD} STREQUAL 00000000) set(LIBOMP_BUILD Development) else() - set(LIBOMP_BUILD ${LIBOMP_VERSION_BUILD}) + set(LIBOMP_BUILD ${LIBBOLT_VERSION_BUILD}) endif() libomp_say("Build -- ${LIBOMP_BUILD}") libomp_say("Use Stats-gathering -- ${LIBOMP_STATS}") libomp_say("Use Debugger-support -- ${LIBOMP_USE_DEBUGGER}") libomp_say("Use ITT notify -- ${LIBOMP_USE_ITT_NOTIFY}") - libomp_say("Use OMPT-support -- ${LIBOMP_OMPT_SUPPORT}") - if(${LIBOMP_OMPT_SUPPORT}) + libomp_say("Use OMPT-support -- ${LIBBOLT_OMPT_SUPPORT}") + if(${LIBBOLT_OMPT_SUPPORT}) libomp_say("Use OMPT-optional -- ${LIBOMP_OMPT_OPTIONAL}") endif() libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}") libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}") libomp_say("Use TSAN-support -- ${LIBOMP_TSAN_SUPPORT}") libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}") + libomp_say("Use Argobots -- ${LIBOMP_USE_ARGOBOTS}") + libomp_say("Use builtin Argobots -- ${LIBOMP_USE_BUILTIN_ARGOBOTS}") + libomp_say("Argobots path -- ${LIBOMP_ARGOBOTS_INSTALL_DIR}") endif() add_subdirectory(src) add_subdirectory(test) + +# make these variables available for tools: +set(LIBBOLT_LIBRARY_DIR ${LIBBOLT_LIBRARY_DIR} PARENT_SCOPE) +set(LIBBOLT_INCLUDE_DIR ${LIBBOLT_INCLUDE_DIR} PARENT_SCOPE) diff --git a/runtime/cmake/LibboltSymlinkArgobots.cmake b/runtime/cmake/LibboltSymlinkArgobots.cmake new file mode 100644 index 000000000..5f1683962 --- /dev/null +++ b/runtime/cmake/LibboltSymlinkArgobots.cmake @@ -0,0 +1,13 @@ +# Create symbolic links for testing. 
+ +FILE(GLOB LIBOMP_ARGOBOTS_LIBS "${LIBOMP_ARGOBOTS_INSTALL_DIR}/lib/${SHARED_LIBRARY_PREFIX}*${SHARED_LIBRARY_SUFFIX}*") +foreach(LIBOMP_ARGOBOTS_LIB IN LISTS LIBOMP_ARGOBOTS_LIBS) + get_filename_component(LIBOMP_ARGOBOTS_LIB_NAME "${LIBOMP_ARGOBOTS_LIB}" NAME) + execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${LIBOMP_ARGOBOTS_LIB}" "${LIBOMP_ARGOBOTS_LIB_NAME}" + WORKING_DIRECTORY ${LIBBOLT_LIBRARY_DIR} + ) +endforeach() +execute_process( + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_ARGOBOTS_INSTALL_DIR}/include/abt.h abt.h + WORKING_DIRECTORY ${LIBBOLT_INCLUDE_DIR} +) diff --git a/runtime/cmake/LibompCheckFortranFlag.cmake b/runtime/cmake/LibompCheckFortranFlag.cmake index 21837ef06..b8cdb28a4 100644 --- a/runtime/cmake/LibompCheckFortranFlag.cmake +++ b/runtime/cmake/LibompCheckFortranFlag.cmake @@ -19,54 +19,9 @@ function(libomp_check_fortran_flag flag boolean) print *, \"Hello World!\" end program hello") - set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping") - if(CMAKE_VERSION VERSION_GREATER 3.1 OR CMAKE_VERSION VERSION_EQUAL 3.1) + set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping") include(CheckFortranSourceCompiles) check_fortran_source_compiles("${fortran_source}" ${boolean} FAIL_REGEX "${failed_regexes}") set(${boolean} ${${boolean}} PARENT_SCOPE) - return() - else() - # Our manual check for cmake versions that don't have CheckFortranSourceCompiles - set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/fortran_flag_check) - file(MAKE_DIRECTORY ${base_dir}) - file(WRITE ${base_dir}/fortran_source.f "${fortran_source}") - - message(STATUS "Performing Test ${boolean}") - execute_process( - COMMAND ${CMAKE_Fortran_COMPILER} "${flag}" ${base_dir}/fortran_source.f - WORKING_DIRECTORY ${base_dir} - RESULT_VARIABLE exit_code - OUTPUT_VARIABLE OUTPUT - ERROR_VARIABLE OUTPUT - ) - - if(${exit_code} EQUAL 0) - foreach(regex IN LISTS failed_regexes) - if("${OUTPUT}" MATCHES ${regex}) - set(retval FALSE) - endif() - endforeach() - else() - set(retval FALSE) - endif() - - if(${retval}) - set(${boolean} 1 CACHE INTERNAL "Test ${boolean}") - message(STATUS "Performing Test ${boolean} - Success") - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log - "Performing Fortran Compiler Flag test ${boolean} succeeded with the following output:\n" - "${OUTPUT}\n" - "Source file was:\n${fortran_source}\n") - else() - set(${boolean} "" CACHE INTERNAL "Test ${boolean}") - message(STATUS "Performing Test ${boolean} - Failed") - file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log - "Performing Fortran Compiler Flag test ${boolean} failed with the following output:\n" - "${OUTPUT}\n" - "Source file was:\n${fortran_source}\n") - endif() - endif() - - set(${boolean} ${retval} PARENT_SCOPE) endif() endfunction() diff --git a/runtime/cmake/LibompCheckLinkerFlag.cmake b/runtime/cmake/LibompCheckLinkerFlag.cmake index 81ce9b0bb..e601e53f1 100644 --- a/runtime/cmake/LibompCheckLinkerFlag.cmake +++ b/runtime/cmake/LibompCheckLinkerFlag.cmake @@ -17,7 +17,7 @@ function(libomp_check_linker_flag flag boolean) set(library_source "int foo(int a) { return a*a; }") set(cmake_source - "cmake_minimum_required(VERSION 2.8) + "cmake_minimum_required(VERSION 3.13.4) project(foo C) set(CMAKE_SHARED_LINKER_FLAGS \"${flag}\") add_library(foo SHARED src_to_link.c)") @@ -38,7 +38,8 @@ function(libomp_check_linker_flag flag boolean) if(try_compile_result) foreach(regex IN LISTS failed_regexes) - if("${OUTPUT}" MATCHES ${regex}) + # Ignore the warning about 
the newer or unknown CUDA version. + if(("${OUTPUT}" MATCHES ${regex}) AND NOT ("${OUTPUT}" MATCHES "Unknown CUDA version")) set(retval FALSE) endif() endforeach() diff --git a/runtime/cmake/LibompExports.cmake b/runtime/cmake/LibompExports.cmake index f98de2631..112701194 100644 --- a/runtime/cmake/LibompExports.cmake +++ b/runtime/cmake/LibompExports.cmake @@ -21,8 +21,8 @@ libomp_append(libomp_suffix .deb DEBUG_BUILD) libomp_append(libomp_suffix .dia RELWITHDEBINFO_BUILD) libomp_append(libomp_suffix .min MINSIZEREL_BUILD) libomp_append(libomp_suffix .s1 LIBOMP_STATS) -libomp_append(libomp_suffix .ompt LIBOMP_OMPT_SUPPORT) -if(${LIBOMP_OMPT_SUPPORT}) +libomp_append(libomp_suffix .ompt LIBBOLT_OMPT_SUPPORT) +if(${LIBBOLT_OMPT_SUPPORT}) libomp_append(libomp_suffix .optional LIBOMP_OMPT_OPTIONAL) endif() string(REPLACE ";" "" libomp_suffix "${libomp_suffix}") @@ -47,12 +47,12 @@ set(LIBOMP_EXPORTS_MOD_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include_compat") set(LIBOMP_EXPORTS_LIB_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}/lib") # Put headers in exports/ directory post build -add_custom_command(TARGET omp POST_BUILD +add_custom_command(TARGET bolt-omp POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_CMN_DIR} COMMAND ${CMAKE_COMMAND} -E copy omp.h ${LIBOMP_EXPORTS_CMN_DIR} ) -if(${LIBOMP_OMPT_SUPPORT}) - add_custom_command(TARGET omp POST_BUILD +if(${LIBBOLT_OMPT_SUPPORT}) + add_custom_command(TARGET bolt-omp POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy omp-tools.h ${LIBOMP_EXPORTS_CMN_DIR} ) endif() @@ -62,32 +62,32 @@ if(${LIBOMP_FORTRAN_MODULES}) COMMAND ${CMAKE_COMMAND} -E copy omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR} COMMAND ${CMAKE_COMMAND} -E copy omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR} ) - add_custom_command(TARGET omp POST_BUILD + add_custom_command(TARGET bolt-omp POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy omp_lib.h ${LIBOMP_EXPORTS_CMN_DIR} ) endif() # Copy OpenMP library into exports/ directory post build if(WIN32) - get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY) + get_target_property(LIBOMP_OUTPUT_DIRECTORY bolt-omp RUNTIME_OUTPUT_DIRECTORY) else() - get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY) + get_target_property(LIBOMP_OUTPUT_DIRECTORY bolt-omp LIBRARY_OUTPUT_DIRECTORY) endif() if(NOT LIBOMP_OUTPUT_DIRECTORY) set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() -add_custom_command(TARGET omp POST_BUILD +add_custom_command(TARGET bolt-omp POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy $ ${LIBOMP_EXPORTS_LIB_DIR} ) # Copy Windows import library into exports/ directory post build if(WIN32) - get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY) + get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ${LIBOMP_IMP_LIB_TARGET} ARCHIVE_OUTPUT_DIRECTORY) if(NOT LIBOMPIMP_OUTPUT_DIRECTORY) set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - add_custom_command(TARGET ompimp POST_BUILD + add_custom_command(TARGET ${LIBOMP_IMP_LIB_TARGET} POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR} COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR} ) diff --git a/runtime/cmake/LibompGetArchitecture.cmake b/runtime/cmake/LibompGetArchitecture.cmake index 897f99a39..dd60a2d34 100644 --- 
a/runtime/cmake/LibompGetArchitecture.cmake +++ b/runtime/cmake/LibompGetArchitecture.cmake @@ -69,3 +69,18 @@ function(libomp_get_architecture return_arch) # Remove ${detect_arch_src_txt} from cmake/ subdirectory file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c") endfunction() + +function(libomp_is_aarch64_a64fx return_is_aarch64_a64fx) + set(is_aarch64_a64fx FALSE) + if (EXISTS "/proc/cpuinfo") + file(READ "/proc/cpuinfo" cpu_info_content) + string(REGEX MATCH "CPU implementer[ \t]*: 0x46\n" cpu_implementer ${cpu_info_content}) + string(REGEX MATCH "CPU architecture[ \t]*: 8\n" cpu_architecture ${cpu_info_content}) + + if (cpu_architecture AND cpu_implementer) + set(is_aarch64_a64fx TRUE) + endif() + endif() + + set(${return_is_aarch64_a64fx} "${is_aarch64_a64fx}" PARENT_SCOPE) +endfunction(libomp_is_aarch64_a64fx) diff --git a/runtime/cmake/LibompHandleFlags.cmake b/runtime/cmake/LibompHandleFlags.cmake index 046c5d813..46c891457 100644 --- a/runtime/cmake/LibompHandleFlags.cmake +++ b/runtime/cmake/LibompHandleFlags.cmake @@ -36,6 +36,7 @@ function(libomp_get_cxxflags cxxflags) libomp_append(flags_local -Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG) libomp_append(flags_local -Wno-uninitialized LIBOMP_HAVE_WNO_UNINITIALIZED_FLAG) libomp_append(flags_local -Wno-unused-but-set-variable LIBOMP_HAVE_WNO_UNUSED_BUT_SET_VARIABLE_FLAG) + # libomp_append(flags_local -Wconversion LIBOMP_HAVE_WCONVERSION_FLAG) libomp_append(flags_local /GS LIBOMP_HAVE_GS_FLAG) libomp_append(flags_local /EHsc LIBOMP_HAVE_EHSC_FLAG) libomp_append(flags_local /Oy- LIBOMP_HAVE_OY__FLAG) @@ -88,18 +89,17 @@ endfunction() # Linker flags function(libomp_get_ldflags ldflags) set(ldflags_local) - libomp_append(ldflags_local "${CMAKE_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_LIB_NAME}.def" + libomp_append(ldflags_local "${CMAKE_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_BINARY_DIR}/${LIBBOLT_LIB_NAME}.def" IF_DEFINED CMAKE_LINK_DEF_FILE_FLAG) - libomp_append(ldflags_local "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}" + libomp_append(ldflags_local "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}${LIBBOLT_VERSION_MAJOR}.${LIBBOLT_VERSION_MINOR}" IF_DEFINED CMAKE_C_OSX_CURRENT_VERSION_FLAG) - libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}" + libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBBOLT_VERSION_MAJOR}.${LIBBOLT_VERSION_MINOR}" IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG) libomp_append(ldflags_local -Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG) libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG) libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG) libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG) - libomp_append(ldflags_local -Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG) libomp_append(ldflags_local -no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG) libomp_append(ldflags_local -static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG) libomp_append(ldflags_local /SAFESEH LIBOMP_HAVE_SAFESEH_FLAG) @@ -123,9 +123,13 @@ function(libomp_get_libflags libflags) set(libflags_local) libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}") libomp_append(libflags_local "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC) + libomp_append(libflags_local "${LIBOMP_ARGOBOTS_LIBRARY}" 
LIBOMP_USE_ARGOBOTS) if(${IA32}) libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY) endif() + if(LIBOMP_HAVE_SHM_OPEN_WITH_LRT) + libomp_append(libflags_local -lrt) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "DragonFly|FreeBSD") libomp_append(libflags_local "-Wl,--no-as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG) libomp_append(libflags_local "-lm") diff --git a/runtime/cmake/LibompMicroTests.cmake b/runtime/cmake/LibompMicroTests.cmake index a7bf8240b..2b5b47711 100644 --- a/runtime/cmake/LibompMicroTests.cmake +++ b/runtime/cmake/LibompMicroTests.cmake @@ -22,10 +22,10 @@ # - Available for Unix, Intel(R) MIC Architecture dynamic library builds. Not available otherwise. # (3) test-execstack # - Tests if stack is executable -# - Fails if stack is executable. Should only be readable and writable. Not exectuable. +# - Fails if stack is executable. Should only be readable and writable. Not executable. # - Program dependencies: perl, readelf # - Available for Unix dynamic library builds. Not available otherwise. -# (4) test-instr (Intel(R) MIC Architecutre only) +# (4) test-instr (Intel(R) MIC Architecture only) # - Tests Intel(R) MIC Architecture libraries for valid instruction set # - Fails if finds invalid instruction for Intel(R) MIC Architecture (wasn't compiled with correct flags) # - Program dependencies: perl, objdump @@ -39,13 +39,13 @@ # get library location if(WIN32) - get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY) - get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY) + get_target_property(LIBOMP_OUTPUT_DIRECTORY bolt-omp RUNTIME_OUTPUT_DIRECTORY) + get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ${LIBOMP_IMP_LIB_TARGET} ARCHIVE_OUTPUT_DIRECTORY) if(NOT LIBOMPIMP_OUTPUT_DIRECTORY) set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() else() - get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY) + get_target_property(LIBOMP_OUTPUT_DIRECTORY bolt-omp LIBRARY_OUTPUT_DIRECTORY) endif() if(NOT LIBOMP_OUTPUT_DIRECTORY) set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -82,16 +82,19 @@ else() # (Unix based systems, Intel(R) MIC Architecture, and Mac) endif() libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}) libomp_append(libomp_test_touch_libs "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC) + libomp_append(libomp_test_touch_libs "${LIBOMP_ARGOBOTS_LIBRARY}" LIBOMP_USE_ARGOBOTS) if(APPLE) set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}") libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC) + libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_ARGOBOTS_LIBRARY_DIR}" LIBOMP_USE_ARGOBOTS) else() set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}") libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC) + libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_ARGOBOTS_LIBRARY_DIR}" LIBOMP_USE_ARGOBOTS) endif() endif() macro(libomp_test_touch_recipe test_touch_dir) - set(libomp_test_touch_dependencies ${LIBOMP_SRC_DIR}/test-touch.c omp) + set(libomp_test_touch_dependencies ${LIBOMP_SRC_DIR}/test-touch.c bolt-omp) set(libomp_test_touch_exe ${test_touch_dir}/test-touch${CMAKE_EXECUTABLE_SUFFIX}) set(libomp_test_touch_obj ${test_touch_dir}/test-touch${CMAKE_C_OUTPUT_EXTENSION}) if(WIN32) @@ -109,7 +112,7 @@ macro(libomp_test_touch_recipe test_touch_dir) endif() endif() 
set(libomp_test_touch_out_flags -Fe${libomp_test_touch_exe} -Fo${libomp_test_touch_obj}) - list(APPEND libomp_test_touch_dependencies ompimp) + list(APPEND libomp_test_touch_dependencies bolt-ompimp) else() set(libomp_test_touch_out_flags -o ${libomp_test_touch_exe}) endif() @@ -125,7 +128,7 @@ macro(libomp_test_touch_recipe test_touch_dir) ) endmacro() libomp_append(libomp_test_touch_env "KMP_VERSION=1") -add_custom_target(libomp-test-touch DEPENDS ${libomp_test_touch_targets}) +add_custom_target(bolt-libomp-test-touch DEPENDS ${libomp_test_touch_targets}) if(WIN32) libomp_test_touch_recipe(test-touch-mt) libomp_test_touch_recipe(test-touch-md) @@ -134,55 +137,59 @@ else() endif() # test-relo -add_custom_target(libomp-test-relo DEPENDS test-relo/.success) +add_custom_target(bolt-libomp-test-relo DEPENDS test-relo/.success) add_custom_command( OUTPUT test-relo/.success test-relo/readelf.log COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-relo COMMAND readelf -d ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} > test-relo/readelf.log COMMAND grep -e TEXTREL test-relo/readelf.log \; test $$? -eq 1 COMMAND ${CMAKE_COMMAND} -E touch test-relo/.success - DEPENDS omp + DEPENDS bolt-omp ) # test-execstack -add_custom_target(libomp-test-execstack DEPENDS test-execstack/.success) +add_custom_target(bolt-libomp-test-execstack DEPENDS test-execstack/.success) add_custom_command( OUTPUT test-execstack/.success COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-execstack COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-execstack.pl --arch=${LIBOMP_PERL_SCRIPT_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} COMMAND ${CMAKE_COMMAND} -E touch test-execstack/.success - DEPENDS omp + DEPENDS bolt-omp ) # test-instr -add_custom_target(libomp-test-instr DEPENDS test-instr/.success) +add_custom_target(bolt-libomp-test-instr DEPENDS test-instr/.success) add_custom_command( OUTPUT test-instr/.success COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-instr COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl --os=${LIBOMP_PERL_SCRIPT_OS} --arch=${LIBOMP_PERL_SCRIPT_ARCH} --show --mic-arch=${LIBOMP_MIC_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} COMMAND ${CMAKE_COMMAND} -E touch test-instr/.success - DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl + DEPENDS bolt-omp ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl ) # test-deps -add_custom_target(libomp-test-deps DEPENDS test-deps/.success) +add_custom_target(bolt-libomp-test-deps DEPENDS test-deps/.success) set(libomp_expected_library_deps) if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set(libomp_expected_library_deps libc.so.7 libthr.so.3 libm.so.5) libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) + libomp_append(libomp_expected_library_deps libabt.so LIBOMP_USE_ARGOBOTS) elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") set(libomp_expected_library_deps libc.so.12 libpthread.so.1 libm.so.0) libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) + libomp_append(libomp_expected_library_deps libabt.so LIBOMP_USE_ARGOBOTS) elseif(CMAKE_SYSTEM_NAME MATCHES "DragonFly") set(libomp_expected_library_deps libc.so.8 libpthread.so.0 libm.so.4) libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) + libomp_append(libomp_expected_library_deps libabt.so LIBOMP_USE_ARGOBOTS) elseif(APPLE) set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib) + libomp_append(libomp_expected_library_deps libabt.dylib 
LIBOMP_USE_ARGOBOTS) elseif(WIN32) set(libomp_expected_library_deps kernel32.dll) - libomp_append(libomp_expected_library_deps psapi.dll LIBOMP_OMPT_SUPPORT) + libomp_append(libomp_expected_library_deps psapi.dll LIBBOLT_OMPT_SUPPORT) else() if(${MIC}) set(libomp_expected_library_deps libc.so.6 libpthread.so.0 libdl.so.2) @@ -218,6 +225,7 @@ else() libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY) libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) endif() + libomp_append(libomp_expected_library_deps libabt.so LIBOMP_USE_ARGOBOTS) libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB) libomp_append(libomp_expected_library_deps libm.so.6 LIBOMP_STATS) endif() @@ -229,5 +237,5 @@ add_custom_command( COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-depends.pl --os=${LIBOMP_PERL_SCRIPT_OS} --arch=${LIBOMP_PERL_SCRIPT_ARCH} --expected="${libomp_expected_library_deps}" ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} COMMAND ${CMAKE_COMMAND} -E touch test-deps/.success - DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-depends.pl + DEPENDS bolt-omp ${LIBOMP_TOOLS_DIR}/check-depends.pl ) diff --git a/runtime/cmake/LibompUtils.cmake b/runtime/cmake/LibompUtils.cmake index 44d236373..7d22ad7a9 100644 --- a/runtime/cmake/LibompUtils.cmake +++ b/runtime/cmake/LibompUtils.cmake @@ -11,19 +11,19 @@ # void libomp_say(string message_to_user); # - prints out message_to_user macro(libomp_say message_to_user) - message(STATUS "LIBOMP: ${message_to_user}") + message(STATUS "BOLT-LIBOMP: ${message_to_user}") endmacro() # void libomp_warning_say(string message_to_user); # - prints out message_to_user with a warning macro(libomp_warning_say message_to_user) - message(WARNING "LIBOMP: ${message_to_user}") + message(WARNING "BOLT-LIBOMP: ${message_to_user}") endmacro() # void libomp_error_say(string message_to_user); # - prints out message_to_user with an error and exits cmake macro(libomp_error_say message_to_user) - message(FATAL_ERROR "LIBOMP: ${message_to_user}") + message(FATAL_ERROR "BOLT-LIBOMP: ${message_to_user}") endmacro() # libomp_append( [(IF_TRUE | IF_FALSE | IF_TRUE_1_0 ) BOOLEAN]) @@ -101,6 +101,8 @@ function(libomp_get_legal_arch return_arch_string) set(${return_arch_string} "PPC64LE" PARENT_SCOPE) elseif(${AARCH64}) set(${return_arch_string} "AARCH64" PARENT_SCOPE) + elseif(${AARCH64_A64FX}) + set(${return_arch_string} "AARCH64_A64FX" PARENT_SCOPE) elseif(${MIPS}) set(${return_arch_string} "MIPS" PARENT_SCOPE) elseif(${MIPS64}) diff --git a/runtime/cmake/config-ix.cmake b/runtime/cmake/config-ix.cmake index 6a19e322f..747764da3 100644 --- a/runtime/cmake/config-ix.cmake +++ b/runtime/cmake/config-ix.cmake @@ -10,10 +10,12 @@ include(CheckCCompilerFlag) include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) include(CheckCXXCompilerFlag) include(CheckIncludeFile) include(CheckLibraryExists) include(CheckIncludeFiles) +include(CheckSymbolExists) include(LibompCheckLinkerFlag) include(LibompCheckFortranFlag) @@ -57,6 +59,7 @@ check_cxx_compiler_flag(-Wno-stringop-truncation LIBOMP_HAVE_WNO_STRINGOP_TRUNCA check_cxx_compiler_flag(-Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG) check_cxx_compiler_flag(-Wno-uninitialized LIBOMP_HAVE_WNO_UNINITIALIZED_FLAG) check_cxx_compiler_flag(-Wno-unused-but-set-variable LIBOMP_HAVE_WNO_UNUSED_BUT_SET_VARIABLE_FLAG) +# check_cxx_compiler_flag(-Wconversion LIBOMP_HAVE_WCONVERSION_FLAG) check_cxx_compiler_flag(-msse2 LIBOMP_HAVE_MSSE2_FLAG) check_cxx_compiler_flag(-ftls-model=initial-exec 
LIBOMP_HAVE_FTLS_MODEL_FLAG) libomp_check_architecture_flag(-mmic LIBOMP_HAVE_MMIC_FLAG) @@ -87,7 +90,7 @@ if(WIN32) endforeach() else() # It is difficult to create a dummy assembly file that compiles into an - # exectuable for every architecture and then check the C compiler to + # executable for every architecture and then check the C compiler to # see if -x assembler-with-cpp exists and works, so we assume it does for non-Windows. set(LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG TRUE) endif() @@ -95,6 +98,14 @@ if(${LIBOMP_FORTRAN_MODULES}) libomp_check_fortran_flag(-m32 LIBOMP_HAVE_M32_FORTRAN_FLAG) endif() +# Check for Unix shared memory +check_symbol_exists(shm_open "sys/mman.h" LIBOMP_HAVE_SHM_OPEN_NO_LRT) +if (NOT LIBOMP_HAVE_SHM_OPEN_NO_LRT) + set(CMAKE_REQUIRED_LIBRARIES -lrt) + check_symbol_exists(shm_open "sys/mman.h" LIBOMP_HAVE_SHM_OPEN_WITH_LRT) + set(CMAKE_REQUIRED_LIBRARIES) +endif() + # Check linker flags if(WIN32) libomp_check_linker_flag(/SAFESEH LIBOMP_HAVE_SAFESEH_FLAG) @@ -105,7 +116,6 @@ elseif(NOT APPLE) libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG) libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG) - libomp_check_linker_flag(-Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG) endif() # Check Intel(R) C Compiler specific flags @@ -133,6 +143,53 @@ else() endif() endif() +# Checking for x86-specific waitpkg and rtm attribute and intrinsics +if (IA32 OR INTEL64) + check_include_file(immintrin.h LIBOMP_HAVE_IMMINTRIN_H) + if (NOT LIBOMP_HAVE_IMMINTRIN_H) + check_include_file(intrin.h LIBOMP_HAVE_INTRIN_H) + endif() + check_cxx_source_compiles("__attribute__((target(\"rtm\"))) + int main() {return 0;}" LIBOMP_HAVE_ATTRIBUTE_RTM) + check_cxx_source_compiles("__attribute__((target(\"waitpkg\"))) + int main() {return 0;}" LIBOMP_HAVE_ATTRIBUTE_WAITPKG) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DIMMINTRIN_H LIBOMP_HAVE_IMMINTRIN_H) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DINTRIN_H LIBOMP_HAVE_INTRIN_H) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DATTRIBUTE_WAITPKG LIBOMP_HAVE_ATTRIBUTE_WAITPKG) + libomp_append(CMAKE_REQUIRED_DEFINITIONS -DATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM) + set(source_code "// check for attribute and wait pkg intrinsics + #ifdef IMMINTRIN_H + #include + #endif + #ifdef INTRIN_H + #include + #endif + #ifdef ATTRIBUTE_WAITPKG + __attribute__((target(\"waitpkg\"))) + #endif + static inline int __kmp_umwait(unsigned hint, unsigned long long counter) { + return _umwait(hint, counter); + } + int main() { int a = __kmp_umwait(0, 1000); return a; }") + check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_WAITPKG_INTRINSICS) + set(source_code "// check for attribute rtm and rtm intrinsics + #ifdef IMMINTRIN_H + #include + #endif + #ifdef INTRIN_H + #include + #endif + #ifdef ATTRIBUTE_RTM + __attribute__((target(\"rtm\"))) + #endif + static inline int __kmp_xbegin() { + return _xbegin(); + } + int main() { int a = __kmp_xbegin(); return a; }") + check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_RTM_INTRINSICS) + set(CMAKE_REQUIRED_DEFINITIONS) +endif() + # Find perl executable # Perl is used to create omp.h (and other headers) along with kmp_i18n_id.inc and kmp_i18n_default.inc find_package(Perl REQUIRED) @@ -234,6 +291,7 @@ else() (LIBOMP_ARCH STREQUAL i386) OR # (LIBOMP_ARCH STREQUAL arm) OR (LIBOMP_ARCH STREQUAL aarch64) OR + (LIBOMP_ARCH STREQUAL aarch64_a64fx) OR (LIBOMP_ARCH 
STREQUAL ppc64le) OR (LIBOMP_ARCH STREQUAL ppc64) OR (LIBOMP_ARCH STREQUAL riscv64)) @@ -266,6 +324,35 @@ if(${LIBOMP_USE_HWLOC}) endif() endif() +# Check if ARGOBOTS support is available +if(${LIBOMP_USE_ARGOBOTS}) + if(WIN32) + set(LIBOMP_HAVE_ARGOBOTS FALSE) + libomp_say("Using Argobots not supported on Windows yet") + elseif(${LIBOMP_USE_BUILTIN_ARGOBOTS}) + set(LIBOMP_ARGOBOTS_LIBRARY ${LIBOMP_ARGOBOTS_INSTALL_DIR}/lib/libabt${CMAKE_SHARED_LIBRARY_SUFFIX}) + get_filename_component(LIBOMP_ARGOBOTS_LIBRARY_DIR ${LIBOMP_ARGOBOTS_LIBRARY} PATH) + set(LIBOMP_HAVE_ARGOBOTS TRUE) + else() + set(CMAKE_REQUIRED_INCLUDES ${LIBOMP_ARGOBOTS_INSTALL_DIR}/include) + check_include_file(abt.h LIBOMP_HAVE_ABT_H) + set(CMAKE_REQUIRED_INCLUDES) + check_library_exists(abt ABT_init + ${LIBOMP_ARGOBOTS_INSTALL_DIR}/lib LIBOMP_HAVE_LIBABT) + find_library(LIBOMP_ARGOBOTS_LIBRARY abt ${LIBOMP_ARGOBOTS_INSTALL_DIR}/lib) + get_filename_component(LIBOMP_ARGOBOTS_LIBRARY_DIR ${LIBOMP_ARGOBOTS_LIBRARY} PATH) + if(LIBOMP_HAVE_ABT_H AND LIBOMP_HAVE_LIBABT AND LIBOMP_ARGOBOTS_LIBRARY) + set(LIBOMP_HAVE_ARGOBOTS TRUE) + else() + set(LIBOMP_HAVE_ARGOBOTS FALSE) + libomp_say("Could not find Argobots") + endif() + endif() + libomp_say("BOLT does not support OMPT") + set(LIBBOLT_OMPT_SUPPORT FALSE) + set(LIBOMP_HAVE_OMPT_SUPPORT FALSE) +endif() + # Check if ThreadSanitizer support is available if("${CMAKE_SYSTEM_NAME}" MATCHES "Linux" AND ${INTEL64}) set(LIBOMP_HAVE_TSAN_SUPPORT TRUE) diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt index a5654d6d5..042a52008 100644 --- a/runtime/src/CMakeLists.txt +++ b/runtime/src/CMakeLists.txt @@ -9,9 +9,10 @@ # # Configure omp.h, kmp_config.h and omp-tools.h if necessary + configure_file(${LIBOMP_INC_DIR}/omp.h.var omp.h @ONLY) configure_file(kmp_config.h.cmake kmp_config.h @ONLY) -if(${LIBOMP_OMPT_SUPPORT}) +if(${LIBBOLT_OMPT_SUPPORT}) configure_file(${LIBOMP_INC_DIR}/omp-tools.h.var omp-tools.h @ONLY) endif() @@ -50,6 +51,10 @@ if(${LIBOMP_USE_HWLOC}) include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include) endif() +if(${LIBOMP_USE_ARGOBOTS}) + include_directories(${LIBOMP_ARGOBOTS_INSTALL_DIR}/include) +endif() + # Getting correct source files to build library set(LIBOMP_CXXFILES) set(LIBOMP_ASMFILES) @@ -92,6 +97,11 @@ else() libomp_append(LIBOMP_CXXFILES kmp_gsupport.cpp) libomp_append(LIBOMP_ASMFILES z_Linux_asm.S) # Unix assembly file endif() + + if(LIBOMP_USE_ARGOBOTS) + libomp_append(LIBOMP_CXXFILES kmp_abt_affinity.cpp) + endif() + libomp_append(LIBOMP_CXXFILES thirdparty/ittnotify/ittnotify_static.cpp LIBOMP_USE_ITT_NOTIFY) libomp_append(LIBOMP_CXXFILES kmp_debugger.cpp LIBOMP_USE_DEBUGGER) libomp_append(LIBOMP_CXXFILES kmp_stats.cpp LIBOMP_STATS) @@ -103,7 +113,7 @@ endif() libomp_append(LIBOMP_CXXFILES kmp_ftn_cdecl.cpp) libomp_append(LIBOMP_CXXFILES kmp_ftn_extra.cpp) libomp_append(LIBOMP_CXXFILES kmp_version.cpp) -libomp_append(LIBOMP_CXXFILES ompt-general.cpp IF_TRUE LIBOMP_OMPT_SUPPORT) +libomp_append(LIBOMP_CXXFILES ompt-general.cpp IF_TRUE LIBBOLT_OMPT_SUPPORT) libomp_append(LIBOMP_CXXFILES tsan_annotations.cpp IF_TRUE LIBOMP_TSAN_SUPPORT) set(LIBOMP_SOURCE_FILES ${LIBOMP_CXXFILES} ${LIBOMP_ASMFILES}) @@ -133,9 +143,23 @@ endif() # Add the OpenMP library libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS) -add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES}) +libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS) +# Build libomp library. Add LLVMSupport dependency if building in-tree with libomptarget profiling enabled. 
+if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING)) + add_library(bolt-omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES}) + # Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS + target_link_libraries(bolt-omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}) +else() + add_llvm_library(bolt-omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES} PARTIAL_SOURCES_INTENDED + LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS} + LINK_COMPONENTS Support + ) +endif() +if(${LIBOMP_USE_BUILTIN_ARGOBOTS}) + add_dependencies(bolt-omp libabt) +endif() -set_target_properties(omp PROPERTIES +set_target_properties(bolt-omp PROPERTIES PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_LIB_FILE}" LINK_FLAGS "${LIBOMP_CONFIGURED_LDFLAGS}" LINKER_LANGUAGE ${LIBOMP_LINKER_LANGUAGE} @@ -143,49 +167,67 @@ set_target_properties(omp PROPERTIES # Get the library's location within the build tree for the unit tester if(NOT WIN32) - get_target_property(LIBOMP_LIBRARY_DIR omp LIBRARY_OUTPUT_DIRECTORY) + get_target_property(LIBBOLT_LIBRARY_DIR bolt-omp LIBRARY_OUTPUT_DIRECTORY) else() - get_target_property(LIBOMP_LIBRARY_DIR omp RUNTIME_OUTPUT_DIRECTORY) + get_target_property(LIBBOLT_LIBRARY_DIR bolt-omp RUNTIME_OUTPUT_DIRECTORY) endif() -if(NOT LIBOMP_LIBRARY_DIR) - set(LIBOMP_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) - set(LIBOMP_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) +if(NOT LIBBOLT_LIBRARY_DIR) + set(LIBBOLT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + set(LIBBOLT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) else() - set(LIBOMP_LIBRARY_DIR ${LIBOMP_LIBRARY_DIR} PARENT_SCOPE) + set(LIBBOLT_LIBRARY_DIR ${LIBBOLT_LIBRARY_DIR} PARENT_SCOPE) endif() +set(LIBBOLT_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) # Add symbolic links to libomp if(NOT WIN32) - add_custom_command(TARGET omp POST_BUILD + add_custom_command(TARGET bolt-omp POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} libgomp${LIBOMP_LIBRARY_SUFFIX} COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} libiomp5${LIBOMP_LIBRARY_SUFFIX} - WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + WORKING_DIRECTORY ${LIBBOLT_LIBRARY_DIR} ) + if (NOT ${LIBBOLT_LIB_NAME} STREQUAL libomp) + add_custom_command(TARGET bolt-omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libomp${LIBOMP_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${LIBBOLT_LIBRARY_DIR} + ) + endif() endif() -# Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS -libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS) -target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}) +# Add symbolic links to Argobots to the runtime path for testing +if(${LIBOMP_USE_ARGOBOTS}) + add_custom_command(TARGET bolt-omp POST_BUILD + COMMAND ${CMAKE_COMMAND} + -D LIBOMP_ARGOBOTS_INSTALL_DIR="${LIBOMP_ARGOBOTS_INSTALL_DIR}" + -D LIBBOLT_LIBRARY_DIR="${LIBBOLT_LIBRARY_DIR}" + -D LIBBOLT_INCLUDE_DIR="${CMAKE_CURRENT_BINARY_DIR}" + -D SHARED_LIBRARY_SUFFIX="${CMAKE_SHARED_LIBRARY_SUFFIX}" + -D SHARED_LIBRARY_PREFIX="${CMAKE_SHARED_LIBRARY_PREFIX}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/LibboltSymlinkArgobots.cmake" + WORKING_DIRECTORY ${LIBBOLT_LIBRARY_DIR} + ) +endif() # Create *.inc before compiling any sources # objects depend on : .inc files -add_custom_target(libomp-needed-headers DEPENDS kmp_i18n_id.inc kmp_i18n_default.inc) -add_dependencies(omp libomp-needed-headers) +add_custom_target(bolt-libomp-needed-headers DEPENDS kmp_i18n_id.inc kmp_i18n_default.inc) +add_dependencies(bolt-omp 
bolt-libomp-needed-headers) # Windows specific build rules if(WIN32) configure_file(libomp.rc.var libomp.rc @ONLY) # Create .def and .rc file before compiling any sources - add_custom_target(libomp-needed-windows-files DEPENDS ${LIBOMP_LIB_NAME}.def) - add_dependencies(omp libomp-needed-windows-files) + add_custom_target(libomp-needed-windows-files DEPENDS ${LIBBOLT_LIB_NAME}.def) + add_dependencies(bolt-omp libomp-needed-windows-files) # z_Windows_NT-586_asm.asm requires definitions to be sent via command line - # It only needs the architecutre macro and OMPT_SUPPORT=0|1 + # It only needs the architecture macro and OMPT_SUPPORT=0|1 libomp_append(LIBOMP_MASM_DEFINITIONS "-D_M_IA32" IF_TRUE IA32) libomp_append(LIBOMP_MASM_DEFINITIONS "-D_M_AMD64" IF_TRUE INTEL64) - libomp_append(LIBOMP_MASM_DEFINITIONS "-DOMPT_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPT_SUPPORT) + libomp_append(LIBOMP_MASM_DEFINITIONS "-DOMPT_SUPPORT" IF_TRUE_1_0 LIBBOLT_OMPT_SUPPORT) libomp_list_to_string("${LIBOMP_MASM_DEFINITIONS}" LIBOMP_MASM_DEFINITIONS) set_property(SOURCE z_Windows_NT-586_asm.asm APPEND_STRING PROPERTY COMPILE_FLAGS " ${LIBOMP_MASM_DEFINITIONS}") set_source_files_properties(thirdparty/ittnotify/ittnotify_static.cpp PROPERTIES COMPILE_DEFINITIONS "UNICODE") @@ -194,36 +236,42 @@ if(WIN32) # the import library is "re-linked" to include kmp_import.cpp which prevents # linking of both Visual Studio OpenMP and newly built OpenMP set_source_files_properties(kmp_import.cpp PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CXXFLAGS}") - set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBOMP_IMP_LIB_FILE ${LIBBOLT_LIB_NAME}${CMAKE_IMPORT_LIBRARY_SUFFIX}) set(LIBOMP_GENERATED_IMP_LIB_FILENAME ${LIBOMP_LIB_FILE}${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(omp PROPERTIES - VERSION ${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR} # uses /version flag + set_target_properties(bolt-omp PROPERTIES + VERSION ${LIBBOLT_VERSION_MAJOR}.${LIBBOLT_VERSION_MINOR} # uses /version flag IMPORT_PREFIX "" IMPORT_SUFFIX "" # control generated import library name when building omp ARCHIVE_OUTPUT_NAME ${LIBOMP_GENERATED_IMP_LIB_FILENAME} ) - # Get generated import library from creating omp - get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY omp ARCHIVE_OUTPUT_DIRECTORY) - if(LIBOMP_IMPORT_LIB_DIRECTORY) - set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + + if(MSVC) + # Get generated import library from creating omp + get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY bolt-omp ARCHIVE_OUTPUT_DIRECTORY) + if(LIBOMP_IMPORT_LIB_DIRECTORY) + set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + else() + set(LIBOMP_GENERATED_IMP_LIB ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + endif() + set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) + # Create new import library that is just the previously created one + kmp_import.cpp + add_library(bolt-ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.cpp) + set_target_properties(bolt-ompimp PROPERTIES + PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}" + LINKER_LANGUAGE C + ) + add_dependencies(bolt-ompimp bolt-omp) # ensure generated import library is created first + set(LIBOMP_IMP_LIB_TARGET bolt-ompimp) else() - set(LIBOMP_GENERATED_IMP_LIB ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + set(LIBOMP_IMP_LIB_TARGET bolt-omp) endif() - 
set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) - # Create new import library that is just the previously created one + kmp_import.cpp - add_library(ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.cpp) - set_target_properties(ompimp PROPERTIES - PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}" - LINKER_LANGUAGE C - ) - add_dependencies(ompimp omp) # ensure generated import library is created first # Create def file to designate exported functions libomp_get_gdflags(LIBOMP_GDFLAGS) # generate-def.pl flags (Windows only) libomp_string_to_list("${LIBOMP_GDFLAGS}" LIBOMP_GDFLAGS) add_custom_command( - OUTPUT ${LIBOMP_LIB_NAME}.def + OUTPUT ${LIBBOLT_LIB_NAME}.def COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/generate-def.pl ${LIBOMP_GDFLAGS} - -o ${LIBOMP_LIB_NAME}.def ${CMAKE_CURRENT_SOURCE_DIR}/dllexports + -o ${LIBBOLT_LIB_NAME}.def ${CMAKE_CURRENT_SOURCE_DIR}/dllexports DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/dllexports ${LIBOMP_TOOLS_DIR}/generate-def.pl ) endif() @@ -232,7 +280,6 @@ endif() # One compilation step creates both omp_lib.mod and omp_lib_kinds.mod if(${LIBOMP_FORTRAN_MODULES}) configure_file(${LIBOMP_INC_DIR}/omp_lib.h.var omp_lib.h @ONLY) - configure_file(${LIBOMP_INC_DIR}/omp_lib.f.var omp_lib.f @ONLY) configure_file(${LIBOMP_INC_DIR}/omp_lib.f90.var omp_lib.f90 @ONLY) # Workaround for gfortran to build modules with the # omp_sched_monotonic integer parameter @@ -244,7 +291,7 @@ if(${LIBOMP_FORTRAN_MODULES}) if(CMAKE_Fortran_COMPILER_SUPPORTS_F90) set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f90) else() - set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f) + message(FATAL_ERROR "Fortran module build requires Fortran 90 compiler") endif() add_custom_command( OUTPUT omp_lib.mod omp_lib_kinds.mod @@ -257,26 +304,26 @@ if(${LIBOMP_FORTRAN_MODULES}) endif() # Move files to exports/ directory if requested -if(${LIBOMP_COPY_EXPORTS}) +if(${LIBBOLT_COPY_EXPORTS}) include(LibompExports) endif() # Micro test rules for after library has been built (cmake/LibompMicroTests.cmake) include(LibompMicroTests) -add_custom_target(libomp-micro-tests) +add_custom_target(bolt-libomp-micro-tests) if(NOT ${MIC} AND NOT CMAKE_CROSSCOMPILING) - add_dependencies(libomp-micro-tests libomp-test-touch) + add_dependencies(bolt-libomp-micro-tests bolt-libomp-test-touch) endif() if(NOT WIN32 AND NOT APPLE) - add_dependencies(libomp-micro-tests libomp-test-relo) + add_dependencies(bolt-libomp-micro-tests bolt-libomp-test-relo) endif() if(NOT WIN32 AND NOT APPLE) - add_dependencies(libomp-micro-tests libomp-test-execstack) + add_dependencies(bolt-libomp-micro-tests bolt-libomp-test-execstack) endif() if(${MIC}) - add_dependencies(libomp-micro-tests libomp-test-instr) + add_dependencies(bolt-libomp-micro-tests bolt-libomp-test-instr) endif() -add_dependencies(libomp-micro-tests libomp-test-deps) +add_dependencies(bolt-libomp-micro-tests bolt-libomp-test-deps) # Install rules # We want to install libomp in DESTDIR/CMAKE_INSTALL_PREFIX/lib @@ -288,10 +335,10 @@ else() set(LIBOMP_HEADERS_INSTALL_PATH "${OPENMP_INSTALL_LIBDIR}/clang/${CLANG_VERSION}/include") endif() if(WIN32) - install(TARGETS omp RUNTIME DESTINATION bin) - install(TARGETS ompimp ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") + install(TARGETS bolt-omp RUNTIME DESTINATION bin) + install(TARGETS ${LIBOMP_IMP_LIB_TARGET} ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") # Create aliases (regular copies) of the library for backwards compatibility - set(LIBOMP_ALIASES "libiomp5md") + 
set(LIBOMP_ALIASES "libomp;libiomp5md") foreach(alias IN LISTS LIBOMP_ALIASES) install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E copy \"${LIBOMP_LIB_FILE}\" \"${alias}${LIBOMP_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \${CMAKE_INSTALL_PREFIX}/bin)") @@ -300,7 +347,14 @@ if(WIN32) endforeach() else() - install(TARGETS omp ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + install(TARGETS bolt-omp ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + # Create an alias of libomp for libbolt. + if (NOT ${LIBBOLT_LIB_NAME} STREQUAL libomp) + install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${LIBOMP_LIB_FILE}\" + \"libomp${LIBOMP_LIBRARY_SUFFIX}\" WORKING_DIRECTORY + \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR})") + endif() if(${LIBOMP_INSTALL_ALIASES}) # Create aliases (symlinks) of the library for backwards compatibility @@ -317,7 +371,7 @@ install( ${CMAKE_CURRENT_BINARY_DIR}/omp.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH} ) -if(${LIBOMP_OMPT_SUPPORT}) +if(${LIBBOLT_OMPT_SUPPORT}) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/omp-tools.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}) # install under legacy name ompt.h install(FILES ${CMAKE_CURRENT_BINARY_DIR}/omp-tools.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH} RENAME ompt.h) diff --git a/runtime/src/dllexports b/runtime/src/dllexports index f76619ec0..1c29ca906 100644 --- a/runtime/src/dllexports +++ b/runtime/src/dllexports @@ -371,6 +371,7 @@ kmpc_set_defaults 224 __kmpc_doacross_fini 264 __kmpc_taskloop 266 __kmpc_critical_with_hint 270 + __kmpc_taskloop_5 285 %endif kmpc_aligned_malloc 265 kmpc_set_disp_num_buffers 267 @@ -387,6 +388,7 @@ kmpc_set_disp_num_buffers 267 __kmpc_task_allow_completion_event 276 __kmpc_taskred_init 277 __kmpc_taskred_modifier_init 278 + __kmpc_omp_target_task_alloc 279 %endif # User API entry points that have both lower- and upper- case versions for Fortran. @@ -517,6 +519,8 @@ kmp_set_disp_num_buffers 890 __kmpc_set_default_allocator __kmpc_get_default_allocator __kmpc_alloc + __kmpc_calloc + __kmpc_realloc __kmpc_free __kmpc_init_allocator __kmpc_destroy_allocator @@ -533,6 +537,9 @@ kmp_set_disp_num_buffers 890 omp_pause_resource_all 757 omp_get_supported_active_levels 758 omp_fulfill_event 759 + omp_display_env 733 + omp_calloc 776 + omp_realloc 777 omp_null_allocator DATA omp_default_mem_alloc DATA diff --git a/runtime/src/exports_so.txt b/runtime/src/exports_so.txt index f7de5fd64..302224181 100644 --- a/runtime/src/exports_so.txt +++ b/runtime/src/exports_so.txt @@ -119,5 +119,7 @@ GOMP_4.0 { } GOMP_3.0; GOMP_4.5 { } GOMP_4.0; +GOMP_5.0 { +} GOMP_4.5; # end of file # diff --git a/runtime/src/extractExternal.cpp b/runtime/src/extractExternal.cpp index b3e55b555..f512ecb2b 100644 --- a/runtime/src/extractExternal.cpp +++ b/runtime/src/extractExternal.cpp @@ -57,7 +57,7 @@ class _rstream : public istrstream { ~_rstream() { delete[] buf; } }; -// A stream encapuslating the content of a file or the content of a string, +// A stream encapsulating the content of a file or the content of a string, // overriding the >> operator to read various integer types in binary form, // as well as a symbol table entry. 
class rstream : public _rstream { diff --git a/runtime/src/i18n/en_US.txt b/runtime/src/i18n/en_US.txt index 822f73c0e..d9fac3288 100644 --- a/runtime/src/i18n/en_US.txt +++ b/runtime/src/i18n/en_US.txt @@ -98,10 +98,10 @@ ThreadIDsNotUnique "thread ids not unique" UsingPthread "using pthread info" LegacyApicIDsNotUnique "legacy APIC ids not unique" x2ApicIDsNotUnique "x2APIC ids not unique" -DisplayEnvBegin "OPENMP DISPLAY ENVIRONMENT BEGIN" -DisplayEnvEnd "OPENMP DISPLAY ENVIRONMENT END" -Device "[device]" -Host "[host]" +DisplayEnvBegin "OPENMP DISPLAY ENVIRONMENT BEGIN" +DisplayEnvEnd "OPENMP DISPLAY ENVIRONMENT END" +Device "[device]" +Host "[host]" Tile "tile" @@ -293,7 +293,7 @@ AffUseGlobCpuid "%1$s: Affinity capable, using global cpuid info" AffCapableUseFlat "%1$s: Affinity capable, using default \"flat\" topology" AffNotCapableUseLocCpuid "%1$s: Affinity not capable, using local cpuid info" AffNotCapableUseCpuinfo "%1$s: Affinity not capable, using cpuinfo file" -AffFlatTopology "%1$s: Affinity not capable, assumming \"flat\" topology" +AffFlatTopology "%1$s: Affinity not capable, assuming \"flat\" topology" InitOSProcSetRespect "%1$s: Initial OS proc set respected: %2$s" InitOSProcSetNotRespect "%1$s: Initial OS proc set not respected: %2$s" AvailableOSProc "%1$s: %2$d available OS procs" @@ -324,7 +324,7 @@ WrongMessageCatalog "Incompatible message catalog \"%1$s\": Version \"% StgIgnored "%1$s: ignored because %2$s has been defined" # %1, -- name of ignored variable, %2 -- name of variable with higher priority. OBSOLETE "%1$s: overrides %3$s specified before" - # %1, %2 -- name and value of the overriding variable, %3 -- name of overriden variable. + # %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable. AffTilesNoHWLOC "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead" AffTilesNoTiles "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead" TopologyExtraTile "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)" @@ -372,7 +372,7 @@ AffParseFilename "%1$s: parsing %2$s." MsgExiting "%1$s - exiting." IncompatibleLibrary "Incompatible %1$s library with version %2$s found." IttFunctionError "ittnotify: Function %1$s failed:" -IttUnknownError "ittnofify: Error #%1$d." +IttUnknownError "ittnotify: Error #%1$d." EnvMiddleWarn "%1$s must be set prior to first parallel region or certain API calls; ignored." CnsLockNotDestroyed "Lock initialized at %1$s(%2$d) was not destroyed" # %1, %2, %3, %4 -- file, line, func, col @@ -402,6 +402,7 @@ AffHWSubsetUnsupported "KMP_HW_SUBSET ignored: unsupported architecture." AffHWSubsetManyCores "KMP_HW_SUBSET ignored: too many cores requested." SyntaxErrorUsing "%1$s: syntax error, using %2$s." AdaptiveNotSupported "%1$s: Adaptive locks are not supported; using queuing." +LockTypeNotSupported "%1$s: Specified locks are not supported; using queuing." EnvSyntaxError "%1$s: Invalid symbols found. Check the value \"%2$s\"." EnvSpacesNotAllowed "%1$s: Spaces between digits are not allowed \"%2$s\"." BoundToOSProcSet "%1$s: pid %2$d tid %3$d thread %4$d bound to OS proc set %5$s" @@ -417,6 +418,8 @@ AffUsingHwloc "%1$s: Affinity capable, using hwloc." AffIgnoringHwloc "%1$s: Ignoring hwloc mechanism." AffHwlocErrorOccurred "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms." EnvSerialWarn "%1$s must be set prior to OpenMP runtime library initialization; ignored." 
+EnvMwaitWarn "You have enabled the use of umonitor/umwait. If the CPU doesn't have that enabled " + "you'll get an illegal instruction exception." EnvVarDeprecated "%1$s variable deprecated, please use %2$s instead." RedMethodNotSupported "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical." AffHWSubsetNoHWLOC "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)" @@ -426,6 +429,7 @@ AffHWSubsetManyProcs "KMP_HW_SUBSET ignored: too many Procs requested." HierSchedInvalid "Hierarchy ignored: unsupported level: %1$s." AffFormatDefault "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}" APIDeprecated "%1$s routine deprecated, please use %2$s instead." +GompFeatureNotSupported "libgomp compatibility layer does not support OpenMP feature: %1$s" # -------------------------------------------------------------------------------------------------- -*- HINTS -*- @@ -439,7 +443,7 @@ SubmitBugReport "Please submit a bug report with this message, comp "compiler and operating system versions. Faster response will be " "obtained by including all program sources. For information on " "submitting this issue, please see " - "https://bugs.llvm.org/." + "https://www.bolt-omp.org/." OBSOLETE "Check NLSPATH environment variable, its value is \"%1$s\"." ChangeStackLimit "Please try changing the shell stack limit or adjusting the " "OMP_STACKSIZE environment variable." diff --git a/runtime/src/include/omp-tools.h.var b/runtime/src/include/omp-tools.h.var index 190b538fa..961e767c6 100644 --- a/runtime/src/include/omp-tools.h.var +++ b/runtime/src/include/omp-tools.h.var @@ -20,6 +20,16 @@ #include #include +#ifdef DEPRECATION_WARNINGS +# ifdef __cplusplus +# define DEPRECATED_51 [[deprecated("as of 5.1")]] +# else +# define DEPRECATED_51 __attribute__((deprecated("as of 5.1"))) +#endif +#else +#define DEPRECATED_51 +#endif + /***************************************************************************** * iteration macros *****************************************************************************/ @@ -133,7 +143,7 @@ \ macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \ \ - macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \ + macro (ompt_callback_masked, ompt_callback_masked_t, 21) /* task at masked begin or end */ \ \ macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \ \ @@ -153,7 +163,12 @@ \ macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \ \ - macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ + macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ \ + macro (ompt_callback_target_emi, ompt_callback_target_emi_t, 33) /* target */ \ + macro (ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op */ \ + macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit */ \ + macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */ \ + macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */ /***************************************************************************** * implementation specific types @@ -190,7 +205,8 @@ typedef enum ompt_callbacks_t { ompt_callback_dependences = 18, ompt_callback_task_dependence = 19, ompt_callback_work = 20, - ompt_callback_master = 21, + ompt_callback_master DEPRECATED_51 = 21, + 
ompt_callback_masked = 21, ompt_callback_target_map = 22, ompt_callback_sync_region = 23, ompt_callback_lock_init = 24, @@ -201,7 +217,12 @@ typedef enum ompt_callbacks_t { ompt_callback_flush = 29, ompt_callback_cancel = 30, ompt_callback_reduction = 31, - ompt_callback_dispatch = 32 + ompt_callback_dispatch = 32, + ompt_callback_target_emi = 33, + ompt_callback_target_data_op_emi = 34, + ompt_callback_target_submit_emi = 35, + ompt_callback_target_map_emi = 36, + ompt_callback_error = 37 } ompt_callbacks_t; typedef enum ompt_record_t { @@ -239,7 +260,8 @@ typedef enum ompt_thread_t { typedef enum ompt_scope_endpoint_t { ompt_scope_begin = 1, - ompt_scope_end = 2 + ompt_scope_end = 2, + ompt_scope_beginend = 3 } ompt_scope_endpoint_t; typedef enum ompt_dispatch_t { @@ -248,22 +270,29 @@ typedef enum ompt_dispatch_t { } ompt_dispatch_t; typedef enum ompt_sync_region_t { - ompt_sync_region_barrier = 1, - ompt_sync_region_barrier_implicit = 2, + ompt_sync_region_barrier DEPRECATED_51 = 1, + ompt_sync_region_barrier_implicit DEPRECATED_51 = 2, ompt_sync_region_barrier_explicit = 3, ompt_sync_region_barrier_implementation = 4, ompt_sync_region_taskwait = 5, ompt_sync_region_taskgroup = 6, - ompt_sync_region_reduction = 7 + ompt_sync_region_reduction = 7, + ompt_sync_region_barrier_implicit_workshare = 8, + ompt_sync_region_barrier_implicit_parallel = 9, + ompt_sync_region_barrier_teams = 10 } ompt_sync_region_t; typedef enum ompt_target_data_op_t { - ompt_target_data_alloc = 1, - ompt_target_data_transfer_to_device = 2, - ompt_target_data_transfer_from_device = 3, - ompt_target_data_delete = 4, - ompt_target_data_associate = 5, - ompt_target_data_disassociate = 6 + ompt_target_data_alloc = 1, + ompt_target_data_transfer_to_device = 2, + ompt_target_data_transfer_from_device = 3, + ompt_target_data_delete = 4, + ompt_target_data_associate = 5, + ompt_target_data_disassociate = 6, + ompt_target_data_alloc_async = 17, + ompt_target_data_transfer_to_device_async = 18, + ompt_target_data_transfer_from_device_async = 19, + ompt_target_data_delete_async = 20 } ompt_target_data_op_t; typedef enum ompt_work_t { @@ -273,7 +302,8 @@ typedef enum ompt_work_t { ompt_work_single_other = 4, ompt_work_workshare = 5, ompt_work_distribute = 6, - ompt_work_taskloop = 7 + ompt_work_taskloop = 7, + ompt_work_scope = 8 } ompt_work_t; typedef enum ompt_mutex_t { @@ -302,6 +332,7 @@ typedef enum ompt_task_flag_t { ompt_task_implicit = 0x00000002, ompt_task_explicit = 0x00000004, ompt_task_target = 0x00000008, + ompt_task_taskwait = 0x00000010, ompt_task_undeferred = 0x08000000, ompt_task_untied = 0x10000000, ompt_task_final = 0x20000000, @@ -316,14 +347,19 @@ typedef enum ompt_task_status_t { ompt_task_detach = 4, ompt_task_early_fulfill = 5, ompt_task_late_fulfill = 6, - ompt_task_switch = 7 + ompt_task_switch = 7, + ompt_taskwait_complete = 8 } ompt_task_status_t; typedef enum ompt_target_t { ompt_target = 1, ompt_target_enter_data = 2, ompt_target_exit_data = 3, - ompt_target_update = 4 + ompt_target_update = 4, + ompt_target_nowait = 9, + ompt_target_enter_data_nowait = 10, + ompt_target_exit_data_nowait = 11, + ompt_target_update_nowait = 12 } ompt_target_t; typedef enum ompt_parallel_flag_t { @@ -348,9 +384,15 @@ typedef enum ompt_dependence_type_t { ompt_dependence_type_inout = 3, ompt_dependence_type_mutexinoutset = 4, ompt_dependence_type_source = 5, - ompt_dependence_type_sink = 6 + ompt_dependence_type_sink = 6, + ompt_dependence_type_inoutset = 7 } ompt_dependence_type_t; +typedef enum ompt_severity_t { + 
ompt_warning = 1, + ompt_fatal = 2 +} ompt_severity_t; + typedef enum ompt_cancel_flag_t { ompt_cancel_parallel = 0x01, ompt_cancel_sections = 0x02, @@ -378,11 +420,13 @@ typedef enum ompt_state_t { ompt_state_work_parallel = 0x001, ompt_state_work_reduction = 0x002, - ompt_state_wait_barrier = 0x010, + ompt_state_wait_barrier DEPRECATED_51 = 0x010, ompt_state_wait_barrier_implicit_parallel = 0x011, ompt_state_wait_barrier_implicit_workshare = 0x012, - ompt_state_wait_barrier_implicit = 0x013, + ompt_state_wait_barrier_implicit DEPRECATED_51 = 0x013, ompt_state_wait_barrier_explicit = 0x014, + ompt_state_wait_barrier_implementation = 0x015, + ompt_state_wait_barrier_teams = 0x016, ompt_state_wait_taskwait = 0x020, ompt_state_wait_taskgroup = 0x021, @@ -799,19 +843,21 @@ typedef struct ompt_record_implicit_task_t { int flags; } ompt_record_implicit_task_t; -typedef void (*ompt_callback_master_t) ( +typedef void (*ompt_callback_masked_t) ( ompt_scope_endpoint_t endpoint, ompt_data_t *parallel_data, ompt_data_t *task_data, const void *codeptr_ra ); -typedef struct ompt_record_master_t { +typedef ompt_callback_masked_t ompt_callback_master_t DEPRECATED_51; + +typedef struct ompt_record_masked_t { ompt_scope_endpoint_t endpoint; ompt_id_t parallel_id; ompt_id_t task_id; const void *codeptr_ra; -} ompt_record_master_t; +} ompt_record_masked_t; typedef void (*ompt_callback_sync_region_t) ( ompt_sync_region_t kind, @@ -918,6 +964,20 @@ typedef void (*ompt_callback_device_unload_t) ( uint64_t module_id ); +typedef void (*ompt_callback_target_data_op_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_data_op_t) ( ompt_id_t target_id, ompt_id_t host_op_id, @@ -942,6 +1002,16 @@ typedef struct ompt_record_target_data_op_t { const void *codeptr_ra; } ompt_record_target_data_op_t; +typedef void (*ompt_callback_target_emi_t) ( + ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t *task_data, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_t) ( ompt_target_t kind, ompt_scope_endpoint_t endpoint, @@ -960,6 +1030,16 @@ typedef struct ompt_record_target_t { const void *codeptr_ra; } ompt_record_target_t; +typedef void (*ompt_callback_target_map_emi_t) ( + ompt_data_t *target_data, + unsigned int nitems, + void **host_addr, + void **device_addr, + size_t *bytes, + unsigned int *mapping_flags, + const void *codeptr_ra +); + typedef void (*ompt_callback_target_map_t) ( ompt_id_t target_id, unsigned int nitems, @@ -980,6 +1060,13 @@ typedef struct ompt_record_target_map_t { const void *codeptr_ra; } ompt_record_target_map_t; +typedef void (*ompt_callback_target_submit_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + unsigned int requested_num_teams +); + typedef void (*ompt_callback_target_submit_t) ( ompt_id_t target_id, ompt_id_t host_op_id, @@ -1006,6 +1093,12 @@ typedef struct ompt_record_control_tool_t { const void *codeptr_ra; } ompt_record_control_tool_t; +typedef void (*ompt_callback_error_t) ( + ompt_severity_t severity, + const char *message, size_t length, + const void *codeptr_ra +); + typedef struct ompd_address_t { ompd_seg_t segment; ompd_addr_t address; @@ -1049,7 
+1142,7 @@ typedef struct ompt_record_ompt_t { ompt_record_task_dependence_t task_dependence; ompt_record_task_schedule_t task_schedule; ompt_record_implicit_task_t implicit_task; - ompt_record_master_t master; + ompt_record_masked_t masked; ompt_record_sync_region_t sync_region; ompt_record_mutex_acquire_t mutex_acquire; ompt_record_mutex_t mutex; diff --git a/runtime/src/include/omp.h.var b/runtime/src/include/omp.h.var index 2246e7012..1a50578fe 100644 --- a/runtime/src/include/omp.h.var +++ b/runtime/src/include/omp.h.var @@ -18,9 +18,49 @@ # include # include -# define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@ -# define KMP_VERSION_MINOR @LIBOMP_VERSION_MINOR@ -# define KMP_VERSION_BUILD @LIBOMP_VERSION_BUILD@ +/* + * BOLT_VERSION is the version string. BOLT_NUMVERSION is the + * numeric version that can be used in numeric comparisons. + * + * BOLT_VERSION uses the following format: + * Version: [MAJ].[MIN].[REV][EXT][EXT_NUMBER] + * Example: 1.0.7rc1 has + * MAJ = 1 + * MIN = 0 + * REV = 7 + * EXT = rc + * EXT_NUMBER = 1 + * + * BOLT_NUMVERSION will convert EXT to a format number: + * ALPHA (a) = 0 + * BETA (b) = 1 + * RC (rc) = 2 + * PATCH (p) = 3 + * Regular releases are treated as patch 0 + * + * Numeric version will have 1 digit for MAJ, 2 digits for MIN, 2 + * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So, + * 1.0.7rc1 will have the numeric version 10007201. + */ +# define BOLT_VERSION "@BOLT_VERSION@" +# define BOLT_NUMVERSION @BOLT_NUMVERSION@ +# define BOLT_RELEASE_TYPE_ALPHA 0 +# define BOLT_RELEASE_TYPE_BETA 1 +# define BOLT_RELEASE_TYPE_RC 2 +# define BOLT_RELEASE_TYPE_PATCH 3 +# define BOLT_CALC_VERSION(MAJOR, MINOR, REVISION, TYPE, PATCH) \ + (((MAJOR) * 10000000) + ((MINOR) * 100000) + ((REVISION) * 1000) + ((TYPE) * 100) + (PATCH)) +# define BOLT_RELEASE_DATE "@BOLT_RELEASE_DATE@" +# define BOLT_COMPILER_CC "@BOLT_COMPILER_CC@ @BOLT_COMPILER_CFLAGS@" +# define BOLT_COMPILER_CXX "@BOLT_COMPILER_CXX@ @BOLT_COMPILER_CXXFLAGS@" + +# define BOLT_THREAD_ARGOBOTS 1 +# define BOLT_THREAD_NATIVE 0 +# define BOLT_THREAD @BOLT_THREAD_TYPE@ + +# define KMP_VERSION_MAJOR @LIBBOLT_VERSION_MAJOR@ +# define KMP_VERSION_MINOR @LIBBOLT_VERSION_MINOR@ +# define KMP_VERSION_BUILD @LIBBOLT_VERSION_BUILD@ # define KMP_BUILD_DATE "@LIBOMP_BUILD_DATE@" # ifdef __cplusplus @@ -152,6 +192,85 @@ extern int __KAI_KMPC_CONVENTION omp_get_device_num (void); typedef void * omp_depend_t; + /* OpenMP 5.1 interop */ + typedef intptr_t omp_intptr_t; + + /* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */ + typedef enum omp_interop_property { + omp_ipr_fr_id = -1, + omp_ipr_fr_name = -2, + omp_ipr_vendor = -3, + omp_ipr_vendor_name = -4, + omp_ipr_device_num = -5, + omp_ipr_platform = -6, + omp_ipr_device = -7, + omp_ipr_device_context = -8, + omp_ipr_targetsync = -9, + omp_ipr_first = -9 + } omp_interop_property_t; + + #define omp_interop_none 0 + + typedef enum omp_interop_rc { + omp_irc_no_value = 1, + omp_irc_success = 0, + omp_irc_empty = -1, + omp_irc_out_of_range = -2, + omp_irc_type_int = -3, + omp_irc_type_ptr = -4, + omp_irc_type_str = -5, + omp_irc_other = -6 + } omp_interop_rc_t; + + typedef void * omp_interop_t; + + /*! + * The `omp_get_num_interop_properties` routine retrieves the number of implementation-defined properties available for an `omp_interop_t` object. + */ + extern int __KAI_KMPC_CONVENTION omp_get_num_interop_properties(const omp_interop_t); + /*! 
+ * The `omp_get_interop_int` routine retrieves an integer property from an `omp_interop_t` object. + */ + extern omp_intptr_t __KAI_KMPC_CONVENTION omp_get_interop_int(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_ptr` routine retrieves a pointer property from an `omp_interop_t` object. + */ + extern void * __KAI_KMPC_CONVENTION omp_get_interop_ptr(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_str` routine retrieves a string property from an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_str(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_name` routine retrieves a property name from an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_name(const omp_interop_t, omp_interop_property_t); + /*! + * The `omp_get_interop_type_desc` routine retrieves a description of the type of a property associated with an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t); + /*! + * The `omp_get_interop_rc_desc` routine retrieves a description of the return code associated with an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_rc_t, omp_interop_rc_t); + + /* OpenMP 5.1 device memory routines */ + + /*! + * The `omp_target_memcpy_async` routine asynchronously performs a copy between any combination of host and device pointers. + */ + extern int __KAI_KMPC_CONVENTION omp_target_memcpy_async(void *, const void *, size_t, size_t, size_t, int, + int, int, omp_depend_t *); + /*! + * The `omp_target_memcpy_rect_async` routine asynchronously performs a copy between any combination of host and device pointers. + */ + extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *, + const size_t *, const size_t *, const size_t *, const size_t *, int, int, + int, omp_depend_t *); + /*! + * The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device. 
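For context, a minimal usage sketch of the interop query routines declared in the omp.h.var hunk above (illustrative only, not part of the patch; the signatures are taken from the declarations above, and the status-code behaviour for an empty handle is assumed to follow the OpenMP 5.1 specification):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      int rc = omp_irc_success;
      omp_interop_t iobj = omp_interop_none;   /* no foreign runtime object attached */
      /* Query a property; the final int* argument reports the status of the query. */
      omp_intptr_t dev = omp_get_interop_int(iobj, omp_ipr_device_num, &rc);
      if (rc != omp_irc_success)
        printf("interop query did not succeed, status %d\n", rc);
      else
        printf("interop device number: %ld\n", (long)dev);
      return 0;
    }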
+ */ + extern void * __KAI_KMPC_CONVENTION omp_get_mapped_ptr(const void *, int); + /* kmp API functions */ extern int __KAI_KMPC_CONVENTION kmp_get_stacksize (void); extern void __KAI_KMPC_CONVENTION kmp_set_stacksize (int); @@ -228,37 +347,38 @@ typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_sync_hint = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_serialized = 5, + omp_atv_sequential = omp_atv_serialized, // (deprecated) + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; + #define omp_atv_default ((omp_uintptr_t)-1) typedef struct { omp_alloctrait_key_t key; @@ -323,9 +443,16 @@ extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void); # ifdef __cplusplus extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, + omp_allocator_handle_t allocator = omp_null_allocator, + omp_allocator_handle_t free_allocator = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); # else extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); # endif @@ -355,6 +482,9 @@ extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void); + /* OpenMP 5.1 Display Environment */ + extern void omp_display_env(int verbose); + # undef __KAI_KMPC_CONVENTION # undef __KMP_IMP diff --git a/runtime/src/include/omp_lib.f.var b/runtime/src/include/omp_lib.f.var deleted file mode 100644 index 19f14d75f..000000000 --- a/runtime/src/include/omp_lib.f.var +++ /dev/null @@ -1,1047 +0,0 @@ -! include/omp_lib.f.var - -! -!//===----------------------------------------------------------------------===// -!// -!// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
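Similarly, a brief sketch exercising the new allocator entry points (omp_calloc, omp_realloc) and the BOLT version macros added to omp.h.var in the hunks above — illustrative only, assuming a header generated by a BOLT build so that BOLT_VERSION and BOLT_NUMVERSION are substituted with real values:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      /* BOLT_CALC_VERSION packs MAJ.MIN.REV[EXT][N]; e.g. 1.0.7rc1 -> 10007201. */
      printf("BOLT %s, numeric %d\n", BOLT_VERSION, BOLT_NUMVERSION);
    #if BOLT_NUMVERSION >= BOLT_CALC_VERSION(1, 0, 0, BOLT_RELEASE_TYPE_PATCH, 0)
      /* Header comes from BOLT 1.0 or newer (regular releases use the PATCH type). */
    #endif

      /* Zero-initialized block from the default allocator, then resized; the old
         storage is released back through the free_allocator argument. */
      double *a = (double *)omp_calloc(100, sizeof(double), omp_default_mem_alloc);
      a = (double *)omp_realloc(a, 200 * sizeof(double),
                                omp_default_mem_alloc, omp_default_mem_alloc);
      omp_free(a, omp_default_mem_alloc);
      return 0;
    }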
-!// See https://llvm.org/LICENSE.txt for license information. -!// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -!// -!//===----------------------------------------------------------------------===// -! - -!*** -!*** Some of the directives for the following routine extend past column 72, -!*** so process this file in 132-column mode. -!*** - -!dec$ fixedformlinesize:132 - - module omp_lib_kinds - - integer, parameter :: omp_integer_kind = 4 - integer, parameter :: omp_logical_kind = 4 - integer, parameter :: omp_real_kind = 4 - integer, parameter :: omp_lock_kind = int_ptr_kind() - integer, parameter :: omp_nest_lock_kind = int_ptr_kind() - integer, parameter :: omp_sched_kind = omp_integer_kind - integer, parameter :: omp_proc_bind_kind = omp_integer_kind - integer, parameter :: kmp_pointer_kind = int_ptr_kind() - integer, parameter :: kmp_size_t_kind = int_ptr_kind() - integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind() - integer, parameter :: kmp_cancel_kind = omp_integer_kind - integer, parameter :: omp_lock_hint_kind = omp_integer_kind - integer, parameter :: omp_control_tool_kind = omp_integer_kind - integer, parameter :: omp_control_tool_result_kind = omp_integer_kind - integer, parameter :: omp_allocator_handle_kind = int_ptr_kind() - integer, parameter :: omp_memspace_handle_kind = int_ptr_kind() - integer, parameter :: omp_alloctrait_key_kind = omp_integer_kind - integer, parameter :: omp_alloctrait_val_kind = int_ptr_kind() - - type omp_alloctrait - integer(kind=omp_alloctrait_key_kind) key - integer(kind=omp_alloctrait_val_kind) value - end type omp_alloctrait - - integer, parameter :: omp_pause_resource_kind = omp_integer_kind - integer, parameter :: omp_depend_kind = int_ptr_kind() - integer, parameter :: omp_event_handle_kind = int_ptr_kind() - - end module omp_lib_kinds - - module omp_lib - - use omp_lib_kinds - - integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@ - integer (kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBOMP_VERSION_MINOR@ - integer (kind=omp_integer_kind), parameter :: kmp_version_build = @LIBOMP_VERSION_BUILD@ - character(*), parameter :: kmp_build_date = '@LIBOMP_BUILD_DATE@' - integer (kind=omp_integer_kind), parameter :: openmp_version = @LIBOMP_OMP_YEAR_MONTH@ - - integer(kind=omp_sched_kind), parameter :: omp_sched_static = 1 - integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2 - integer(kind=omp_sched_kind), parameter :: omp_sched_guided = 3 - integer(kind=omp_sched_kind), parameter :: omp_sched_auto = 4 - integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = Z'80000000' - - integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0 - integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1 - integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2 - integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3 - integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4 - - integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1 - integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2 - integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3 - integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4 - - integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_none = 0 - integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_uncontended = 1 - integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_contended = 2 - integer 
(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_nonspeculative = 4 - integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_speculative = 8 - integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_hle = 65536 - integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm = 131072 - integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive = 262144 - - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_threadmodel = 1 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_alignment = 2 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_access = 3 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pool_size = 4 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fallback = 5 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fb_data = 6 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pinned = 7 - integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_partition = 8 - - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_false = 0 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_true = 1 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = 2 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_contended = 3 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_uncontended = 4 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = 5 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_private = 6 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_all = 7 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_thread = 8 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_pteam = 9 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_cgroup = 10 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default_mem_fb = 11 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_null_fb = 12 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_abort_fb = 13 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_allocator_fb = 14 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_environment = 15 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_nearest = 16 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_blocked = 17 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_interleaved = 18 - - integer (kind=omp_allocator_handle_kind), parameter :: omp_null_allocator = 0 - integer (kind=omp_allocator_handle_kind), parameter :: omp_default_mem_alloc = 1 - integer (kind=omp_allocator_handle_kind), parameter :: omp_large_cap_mem_alloc = 2 - integer (kind=omp_allocator_handle_kind), parameter :: omp_const_mem_alloc = 3 - integer (kind=omp_allocator_handle_kind), parameter :: omp_high_bw_mem_alloc = 4 - integer (kind=omp_allocator_handle_kind), parameter :: omp_low_lat_mem_alloc = 5 - integer (kind=omp_allocator_handle_kind), parameter :: omp_cgroup_mem_alloc = 6 - integer (kind=omp_allocator_handle_kind), parameter :: omp_pteam_mem_alloc = 7 - integer (kind=omp_allocator_handle_kind), parameter :: omp_thread_mem_alloc = 8 - - integer (kind=omp_memspace_handle_kind), parameter :: omp_default_mem_space = 0 - integer (kind=omp_memspace_handle_kind), parameter :: omp_large_cap_mem_space = 1 - integer (kind=omp_memspace_handle_kind), parameter :: omp_const_mem_space = 2 - integer (kind=omp_memspace_handle_kind), parameter :: omp_high_bw_mem_space = 3 - integer 
(kind=omp_memspace_handle_kind), parameter :: omp_low_lat_mem_space = 4 - - integer (kind=omp_pause_resource_kind), parameter :: omp_pause_resume = 0 - integer (kind=omp_pause_resource_kind), parameter :: omp_pause_soft = 1 - integer (kind=omp_pause_resource_kind), parameter :: omp_pause_hard = 2 - - interface - -! *** -! *** omp_* entry points -! *** - - subroutine omp_set_num_threads(num_threads) - use omp_lib_kinds - integer (kind=omp_integer_kind) num_threads - end subroutine omp_set_num_threads - - subroutine omp_set_dynamic(dynamic_threads) - use omp_lib_kinds - logical (kind=omp_logical_kind) dynamic_threads - end subroutine omp_set_dynamic - - subroutine omp_set_nested(nested) - use omp_lib_kinds - logical (kind=omp_logical_kind) nested - end subroutine omp_set_nested - - function omp_get_num_threads() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_num_threads - end function omp_get_num_threads - - function omp_get_max_threads() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_max_threads - end function omp_get_max_threads - - function omp_get_thread_num() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_thread_num - end function omp_get_thread_num - - function omp_get_num_procs() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_num_procs - end function omp_get_num_procs - - function omp_in_parallel() - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_in_parallel - end function omp_in_parallel - - function omp_in_final() - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_in_final - end function omp_in_final - - function omp_get_dynamic() - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_get_dynamic - end function omp_get_dynamic - - function omp_get_nested() - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_get_nested - end function omp_get_nested - - function omp_get_thread_limit() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_thread_limit - end function omp_get_thread_limit - - subroutine omp_set_max_active_levels(max_levels) - use omp_lib_kinds - integer (kind=omp_integer_kind) max_levels - end subroutine omp_set_max_active_levels - - function omp_get_max_active_levels() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_max_active_levels - end function omp_get_max_active_levels - - function omp_get_level() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_level - end function omp_get_level - - function omp_get_active_level() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_active_level - end function omp_get_active_level - - function omp_get_ancestor_thread_num(level) - use omp_lib_kinds - integer (kind=omp_integer_kind) level - integer (kind=omp_integer_kind) omp_get_ancestor_thread_num - end function omp_get_ancestor_thread_num - - function omp_get_team_size(level) - use omp_lib_kinds - integer (kind=omp_integer_kind) level - integer (kind=omp_integer_kind) omp_get_team_size - end function omp_get_team_size - - subroutine omp_set_schedule(kind, chunk_size) - use omp_lib_kinds - integer (kind=omp_sched_kind) kind - integer (kind=omp_integer_kind) chunk_size - end subroutine omp_set_schedule - - subroutine omp_get_schedule(kind, chunk_size) - use omp_lib_kinds - integer (kind=omp_sched_kind) kind - integer (kind=omp_integer_kind) chunk_size - end subroutine omp_get_schedule - - function omp_get_proc_bind() - use omp_lib_kinds - integer (kind=omp_proc_bind_kind) omp_get_proc_bind - end function omp_get_proc_bind - - function omp_get_num_places() 
- use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_num_places - end function omp_get_num_places - - function omp_get_place_num_procs(place_num) - use omp_lib_kinds - integer (kind=omp_integer_kind) place_num - integer (kind=omp_integer_kind) omp_get_place_num_procs - end function omp_get_place_num_procs - - subroutine omp_get_place_proc_ids(place_num, ids) - use omp_lib_kinds - integer (kind=omp_integer_kind) place_num - integer (kind=omp_integer_kind) ids(*) - end subroutine omp_get_place_proc_ids - - function omp_get_place_num() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_place_num - end function omp_get_place_num - - function omp_get_partition_num_places() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_partition_num_places - end function omp_get_partition_num_places - - subroutine omp_get_partition_place_nums(place_nums) - use omp_lib_kinds - integer (kind=omp_integer_kind) place_nums(*) - end subroutine omp_get_partition_place_nums - - function omp_get_wtime() - double precision omp_get_wtime - end function omp_get_wtime - - function omp_get_wtick () - double precision omp_get_wtick - end function omp_get_wtick - - function omp_get_default_device() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_default_device - end function omp_get_default_device - - subroutine omp_set_default_device(device_num) - use omp_lib_kinds - integer (kind=omp_integer_kind) device_num - end subroutine omp_set_default_device - - function omp_get_num_devices() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_num_devices - end function omp_get_num_devices - - function omp_get_num_teams() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_num_teams - end function omp_get_num_teams - - function omp_get_team_num() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_team_num - end function omp_get_team_num - - function omp_get_cancellation() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_cancellation - end function omp_get_cancellation - - function omp_is_initial_device() - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_is_initial_device - end function omp_is_initial_device - - function omp_get_initial_device() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_initial_device - end function omp_get_initial_device - - function omp_get_device_num() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_device_num - end function omp_get_device_num - - function omp_pause_resource(kind, device_num) - use omp_lib_kinds - integer (kind=omp_pause_resource_kind) kind - integer (kind=omp_integer_kind) device_num - integer (kind=omp_integer_kind) omp_pause_resource - end function omp_pause_resource - - function omp_pause_resource_all(kind) - use omp_lib_kinds - integer (kind=omp_pause_resource_kind) kind - integer (kind=omp_integer_kind) omp_pause_resource_all - end function omp_pause_resource_all - - function omp_get_supported_active_levels() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_supported_active_levels - end function omp_get_supported_active_levels - - subroutine omp_fulfill_event(event) - use omp_lib_kinds - integer (kind=omp_event_handle_kind) event - end subroutine omp_fulfill_event - - subroutine omp_init_lock(svar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_init_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_lock_kind) svar - end subroutine omp_init_lock - - subroutine omp_destroy_lock(svar) -!DIR$ 
IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_destroy_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_lock_kind) svar - end subroutine omp_destroy_lock - - subroutine omp_set_lock(svar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_set_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_lock_kind) svar - end subroutine omp_set_lock - - subroutine omp_unset_lock(svar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_unset_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_lock_kind) svar - end subroutine omp_unset_lock - - function omp_test_lock(svar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_test_lock -!DIR$ ENDIF - use omp_lib_kinds - logical (kind=omp_logical_kind) omp_test_lock - integer (kind=omp_lock_kind) svar - end function omp_test_lock - - subroutine omp_init_nest_lock(nvar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_init_nest_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_nest_lock_kind) nvar - end subroutine omp_init_nest_lock - - subroutine omp_destroy_nest_lock(nvar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_nest_lock_kind) nvar - end subroutine omp_destroy_nest_lock - - subroutine omp_set_nest_lock(nvar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_set_nest_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_nest_lock_kind) nvar - end subroutine omp_set_nest_lock - - subroutine omp_unset_nest_lock(nvar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_unset_nest_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_nest_lock_kind) nvar - end subroutine omp_unset_nest_lock - - function omp_test_nest_lock(nvar) -!DIR$ IF(__INTEL_COMPILER.GE.1400) -!DIR$ attributes known_intrinsic :: omp_test_nest_lock -!DIR$ ENDIF - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_test_nest_lock - integer (kind=omp_nest_lock_kind) nvar - end function omp_test_nest_lock - - function omp_get_max_task_priority() - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_max_task_priority - end function omp_get_max_task_priority - - function omp_init_allocator(memspace, ntraits, traits) - use omp_lib_kinds - integer (omp_allocator_handle_kind) omp_init_allocator - integer (omp_memspace_handle_kind) :: memspace - integer (omp_integer_kind) :: ntraits - type(omp_alloctrait), intent(in) :: traits(*) - end function omp_init_allocator - - subroutine omp_destroy_allocator(allocator) bind(c) - use omp_lib_kinds - integer (omp_allocator_handle_kind), value :: allocator - end subroutine omp_destroy_allocator - - subroutine omp_set_default_allocator(allocator) bind(c) - use omp_lib_kinds - integer (omp_allocator_handle_kind) allocator - end subroutine omp_set_default_allocator - - function omp_get_default_allocator() bind(c) - use omp_lib_kinds - integer(omp_allocator_handle_kind)omp_get_default_allocator - end function omp_get_default_allocator - - subroutine omp_set_affinity_format(format) - character (len=*) format - end subroutine omp_set_affinity_format - - function omp_get_affinity_format(buffer) - use omp_lib_kinds - character (len=*) buffer - integer (kind=kmp_size_t_kind) omp_get_affinity_format - end function omp_get_affinity_format - - subroutine omp_display_affinity(format) - character (len=*) format - end subroutine 
omp_display_affinity - - function omp_capture_affinity(buffer, format) - use omp_lib_kinds - character (len=*) format - character (len=*) buffer - integer (kind=kmp_size_t_kind) omp_capture_affinity - end function omp_capture_affinity - -! *** -! *** kmp_* entry points -! *** - - subroutine kmp_set_stacksize(size) - use omp_lib_kinds - integer (kind=omp_integer_kind) size - end subroutine kmp_set_stacksize - - subroutine kmp_set_stacksize_s(size) - use omp_lib_kinds - integer (kind=kmp_size_t_kind) size - end subroutine kmp_set_stacksize_s - - subroutine kmp_set_blocktime(msec) - use omp_lib_kinds - integer (kind=omp_integer_kind) msec - end subroutine kmp_set_blocktime - - subroutine kmp_set_library_serial() - end subroutine kmp_set_library_serial - - subroutine kmp_set_library_turnaround() - end subroutine kmp_set_library_turnaround - - subroutine kmp_set_library_throughput() - end subroutine kmp_set_library_throughput - - subroutine kmp_set_library(libnum) - use omp_lib_kinds - integer (kind=omp_integer_kind) libnum - end subroutine kmp_set_library - - subroutine kmp_set_defaults(string) - character*(*) string - end subroutine kmp_set_defaults - - function kmp_get_stacksize() - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_stacksize - end function kmp_get_stacksize - - function kmp_get_stacksize_s() - use omp_lib_kinds - integer (kind=kmp_size_t_kind) kmp_get_stacksize_s - end function kmp_get_stacksize_s - - function kmp_get_blocktime() - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_blocktime - end function kmp_get_blocktime - - function kmp_get_library() - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_library - end function kmp_get_library - - subroutine kmp_set_disp_num_buffers(num) - use omp_lib_kinds - integer (kind=omp_integer_kind) num - end subroutine kmp_set_disp_num_buffers - - function kmp_set_affinity(mask) - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_set_affinity - integer (kind=kmp_affinity_mask_kind) mask - end function kmp_set_affinity - - function kmp_get_affinity(mask) - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_affinity - integer (kind=kmp_affinity_mask_kind) mask - end function kmp_get_affinity - - function kmp_get_affinity_max_proc() - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_affinity_max_proc - end function kmp_get_affinity_max_proc - - subroutine kmp_create_affinity_mask(mask) - use omp_lib_kinds - integer (kind=kmp_affinity_mask_kind) mask - end subroutine kmp_create_affinity_mask - - subroutine kmp_destroy_affinity_mask(mask) - use omp_lib_kinds - integer (kind=kmp_affinity_mask_kind) mask - end subroutine kmp_destroy_affinity_mask - - function kmp_set_affinity_mask_proc(proc, mask) - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc - integer (kind=omp_integer_kind) proc - integer (kind=kmp_affinity_mask_kind) mask - end function kmp_set_affinity_mask_proc - - function kmp_unset_affinity_mask_proc(proc, mask) - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc - integer (kind=omp_integer_kind) proc - integer (kind=kmp_affinity_mask_kind) mask - end function kmp_unset_affinity_mask_proc - - function kmp_get_affinity_mask_proc(proc, mask) - use omp_lib_kinds - integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc - integer (kind=omp_integer_kind) proc - integer (kind=kmp_affinity_mask_kind) mask - end function kmp_get_affinity_mask_proc - - function kmp_malloc(size) - use omp_lib_kinds - integer 
(kind=kmp_pointer_kind) kmp_malloc - integer (kind=kmp_size_t_kind) size - end function kmp_malloc - - function kmp_aligned_malloc(size, alignment) - use omp_lib_kinds - integer (kind=kmp_pointer_kind) kmp_aligned_malloc - integer (kind=kmp_size_t_kind) size - integer (kind=kmp_size_t_kind) alignment - end function kmp_aligned_malloc - - function kmp_calloc(nelem, elsize) - use omp_lib_kinds - integer (kind=kmp_pointer_kind) kmp_calloc - integer (kind=kmp_size_t_kind) nelem - integer (kind=kmp_size_t_kind) elsize - end function kmp_calloc - - function kmp_realloc(ptr, size) - use omp_lib_kinds - integer (kind=kmp_pointer_kind) kmp_realloc - integer (kind=kmp_pointer_kind) ptr - integer (kind=kmp_size_t_kind) size - end function kmp_realloc - - subroutine kmp_free(ptr) - use omp_lib_kinds - integer (kind=kmp_pointer_kind) ptr - end subroutine kmp_free - - subroutine kmp_set_warnings_on() - end subroutine kmp_set_warnings_on - - subroutine kmp_set_warnings_off() - end subroutine kmp_set_warnings_off - - function kmp_get_cancellation_status(cancelkind) - use omp_lib_kinds - integer (kind=kmp_cancel_kind) cancelkind - logical (kind=omp_logical_kind) kmp_get_cancellation_status - end function kmp_get_cancellation_status - - subroutine omp_init_lock_with_hint(svar, hint) - use omp_lib_kinds - integer (kind=omp_lock_kind) svar - integer (kind=omp_lock_hint_kind) hint - end subroutine omp_init_lock_with_hint - - subroutine omp_init_nest_lock_with_hint(nvar, hint) - use omp_lib_kinds - integer (kind=omp_nest_lock_kind) nvar - integer (kind=omp_lock_hint_kind) hint - end subroutine omp_init_nest_lock_with_hint - - function omp_control_tool(command, modifier) - use omp_lib_kinds - integer (kind=omp_integer_kind) omp_control_tool - integer (kind=omp_control_tool_kind) command - integer (kind=omp_control_tool_kind) modifier - end function omp_control_tool - - end interface - -!dec$ if defined(_WIN32) -!dec$ if defined(_WIN64) .or. defined(_M_AMD64) - -!*** -!*** The Fortran entry points must be in uppercase, even if the /Qlowercase -!*** option is specified. The alias attribute ensures that the specified -!*** string is used as the entry point. -!*** -!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an -!*** underscore prepended. On the Windows* OS Intel(R) 64 -!*** architecture, no underscore is prepended. 
-!*** - -!dec$ attributes alias:'OMP_SET_NUM_THREADS' :: omp_set_num_threads -!dec$ attributes alias:'OMP_SET_DYNAMIC' :: omp_set_dynamic -!dec$ attributes alias:'OMP_SET_NESTED' :: omp_set_nested -!dec$ attributes alias:'OMP_GET_NUM_THREADS' :: omp_get_num_threads -!dec$ attributes alias:'OMP_GET_MAX_THREADS' :: omp_get_max_threads -!dec$ attributes alias:'OMP_GET_THREAD_NUM' :: omp_get_thread_num -!dec$ attributes alias:'OMP_GET_NUM_PROCS' :: omp_get_num_procs -!dec$ attributes alias:'OMP_IN_PARALLEL' :: omp_in_parallel -!dec$ attributes alias:'OMP_GET_DYNAMIC' :: omp_get_dynamic -!dec$ attributes alias:'OMP_GET_NESTED' :: omp_get_nested -!dec$ attributes alias:'OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit -!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels -!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels -!dec$ attributes alias:'OMP_GET_LEVEL' :: omp_get_level -!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level -!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num -!dec$ attributes alias:'OMP_GET_TEAM_SIZE' :: omp_get_team_size -!dec$ attributes alias:'OMP_SET_SCHEDULE' :: omp_set_schedule -!dec$ attributes alias:'OMP_GET_SCHEDULE' :: omp_get_schedule -!dec$ attributes alias:'OMP_GET_PROC_BIND' :: omp_get_proc_bind -!dec$ attributes alias:'OMP_GET_WTIME' :: omp_get_wtime -!dec$ attributes alias:'OMP_GET_WTICK' :: omp_get_wtick -!dec$ attributes alias:'OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device -!dec$ attributes alias:'OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device -!dec$ attributes alias:'OMP_GET_NUM_DEVICES' :: omp_get_num_devices -!dec$ attributes alias:'OMP_GET_NUM_TEAMS' :: omp_get_num_teams -!dec$ attributes alias:'OMP_GET_TEAM_NUM' :: omp_get_team_num -!dec$ attributes alias:'OMP_GET_CANCELLATION' :: omp_get_cancellation -!dec$ attributes alias:'OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device -!dec$ attributes alias:'OMP_GET_INITIAL_DEVICE' :: omp_get_initial_device -!dec$ attributes alias:'OMP_GET_MAX_TASK_PRIORITY' :: omp_get_max_task_priority -!dec$ attributes alias:'OMP_GET_DEVICE_NUM' :: omp_get_device_num -!dec$ attributes alias:'OMP_PAUSE_RESOURCE' :: omp_pause_resource -!dec$ attributes alias:'OMP_PAUSE_RESOURCE_ALL' :: omp_pause_resource_all -!dec$ attributes alias:'OMP_GET_SUPPORTED_ACTIVE_LEVELS' :: omp_get_supported_active_levels -!dec$ attributes alias:'OMP_FULFILL_EVENT' :: omp_fulfill_event - -!dec$ attributes alias:'OMP_CONTROL_TOOL' :: omp_control_tool -!dec$ attributes alias:'OMP_SET_AFFINITY_FORMAT' :: omp_set_affinity_format -!dec$ attributes alias:'OMP_GET_AFFINITY_FORMAT' :: omp_get_affinity_format -!dec$ attributes alias:'OMP_DISPLAY_AFFINITY' :: omp_display_affinity -!dec$ attributes alias:'OMP_CAPTURE_AFFINITY' :: omp_capture_affinity - -!dec$ attributes alias:'omp_init_lock' :: omp_init_lock -!dec$ attributes alias:'omp_init_lock_with_hint' :: omp_init_lock_with_hint -!dec$ attributes alias:'omp_destroy_lock' :: omp_destroy_lock -!dec$ attributes alias:'omp_set_lock' :: omp_set_lock -!dec$ attributes alias:'omp_unset_lock' :: omp_unset_lock -!dec$ attributes alias:'omp_test_lock' :: omp_test_lock -!dec$ attributes alias:'omp_init_nest_lock' :: omp_init_nest_lock -!dec$ attributes alias:'omp_init_nest_lock_with_hint' :: omp_init_nest_lock_with_hint -!dec$ attributes alias:'omp_destroy_nest_lock' :: omp_destroy_nest_lock -!dec$ attributes alias:'omp_set_nest_lock' :: omp_set_nest_lock -!dec$ attributes alias:'omp_unset_nest_lock' :: 
omp_unset_nest_lock -!dec$ attributes alias:'omp_test_nest_lock' :: omp_test_nest_lock - -!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize -!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s -!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime -!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial -!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround -!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput -!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library -!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize -!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s -!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime -!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library -!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity -!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity -!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc -!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask -!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask -!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc -!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc -!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc -!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc -!dec$ attributes alias:'KMP_ALIGNED_MALLOC'::kmp_aligned_malloc -!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc -!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc -!dec$ attributes alias:'KMP_FREE'::kmp_free - -!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on -!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off - -!dec$ attributes alias:'KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status - -!dec$ else - -!*** -!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended. 
-!*** - -!dec$ attributes alias:'_OMP_SET_NUM_THREADS' :: omp_set_num_threads -!dec$ attributes alias:'_OMP_SET_DYNAMIC' :: omp_set_dynamic -!dec$ attributes alias:'_OMP_SET_NESTED' :: omp_set_nested -!dec$ attributes alias:'_OMP_GET_NUM_THREADS' :: omp_get_num_threads -!dec$ attributes alias:'_OMP_GET_MAX_THREADS' :: omp_get_max_threads -!dec$ attributes alias:'_OMP_GET_THREAD_NUM' :: omp_get_thread_num -!dec$ attributes alias:'_OMP_GET_NUM_PROCS' :: omp_get_num_procs -!dec$ attributes alias:'_OMP_IN_PARALLEL' :: omp_in_parallel -!dec$ attributes alias:'_OMP_GET_DYNAMIC' :: omp_get_dynamic -!dec$ attributes alias:'_OMP_GET_NESTED' :: omp_get_nested -!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit -!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels -!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels -!dec$ attributes alias:'_OMP_GET_LEVEL' :: omp_get_level -!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level -!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num -!dec$ attributes alias:'_OMP_GET_TEAM_SIZE' :: omp_get_team_size -!dec$ attributes alias:'_OMP_SET_SCHEDULE' :: omp_set_schedule -!dec$ attributes alias:'_OMP_GET_SCHEDULE' :: omp_get_schedule -!dec$ attributes alias:'_OMP_GET_PROC_BIND' :: omp_get_proc_bind -!dec$ attributes alias:'_OMP_GET_WTIME' :: omp_get_wtime -!dec$ attributes alias:'_OMP_GET_WTICK' :: omp_get_wtick -!dec$ attributes alias:'_OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device -!dec$ attributes alias:'_OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device -!dec$ attributes alias:'_OMP_GET_NUM_DEVICES' :: omp_get_num_devices -!dec$ attributes alias:'_OMP_GET_NUM_TEAMS' :: omp_get_num_teams -!dec$ attributes alias:'_OMP_GET_TEAM_NUM' :: omp_get_team_num -!dec$ attributes alias:'_OMP_GET_CANCELLATION' :: omp_get_cancellation -!dec$ attributes alias:'_OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device -!dec$ attributes alias:'_OMP_GET_INITIAL_DEVICE' :: omp_get_initial_device -!dec$ attributes alias:'_OMP_GET_MAX_TASK_PRIORTY' :: omp_get_max_task_priority -!dec$ attributes alias:'_OMP_GET_DEVICE_NUM' :: omp_get_device_num -!dec$ attributes alias:'_OMP_PAUSE_RESOURCE' :: omp_pause_resource -!dec$ attributes alias:'_OMP_PAUSE_RESOURCE_ALL' :: omp_pause_resource_all -!dec$ attributes alias:'_OMP_GET_SUPPORTED_ACTIVE_LEVELS' :: omp_get_supported_active_levels -!dec$ attributes alias:'_OMP_FULFILL_EVENT' :: omp_fulfill_event - -!dec$ attributes alias:'_OMP_CONTROL_TOOL' :: omp_control_tool -!dec$ attributes alias:'_OMP_SET_AFFINITY_FORMAT' :: omp_set_affinity_format -!dec$ attributes alias:'_OMP_GET_AFFINITY_FORMAT' :: omp_get_affinity_format -!dec$ attributes alias:'_OMP_DISPLAY_AFFINITY' :: omp_display_affinity -!dec$ attributes alias:'_OMP_CAPTURE_AFFINITY' :: omp_capture_affinity - -!dec$ attributes alias:'_omp_init_lock' :: omp_init_lock -!dec$ attributes alias:'_omp_init_lock_with_hint' :: omp_init_lock_with_hint -!dec$ attributes alias:'_omp_destroy_lock' :: omp_destroy_lock -!dec$ attributes alias:'_omp_set_lock' :: omp_set_lock -!dec$ attributes alias:'_omp_unset_lock' :: omp_unset_lock -!dec$ attributes alias:'_omp_test_lock' :: omp_test_lock -!dec$ attributes alias:'_omp_init_nest_lock' :: omp_init_nest_lock -!dec$ attributes alias:'_omp_init_nest_lock_with_hint' :: omp_init_nest_lock_with_hint -!dec$ attributes alias:'_omp_destroy_nest_lock' :: omp_destroy_nest_lock -!dec$ attributes alias:'_omp_set_nest_lock' :: omp_set_nest_lock 
-!dec$ attributes alias:'_omp_unset_nest_lock' :: omp_unset_nest_lock -!dec$ attributes alias:'_omp_test_nest_lock' :: omp_test_nest_lock - -!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize -!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s -!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime -!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial -!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround -!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput -!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library -!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize -!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s -!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime -!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library -!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity -!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity -!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc -!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask -!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask -!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc -!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc -!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc -!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc -!dec$ attributes alias:'_KMP_ALIGNED_MALLOC'::kmp_aligned_malloc -!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc -!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc -!dec$ attributes alias:'_KMP_FREE'::kmp_free - -!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on -!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off - -!dec$ attributes alias:'_KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status - -!dec$ endif -!dec$ endif - -!dec$ if defined(__linux) - -!*** -!*** The Linux* OS entry points are in lowercase, with an underscore appended. 
-!*** - -!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads -!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic -!dec$ attributes alias:'omp_set_nested_'::omp_set_nested -!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads -!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads -!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num -!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs -!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel -!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic -!dec$ attributes alias:'omp_get_nested_'::omp_get_nested -!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit -!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels -!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels -!dec$ attributes alias:'omp_get_level_'::omp_get_level -!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level -!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num -!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size -!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule -!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule -!dec$ attributes alias:'omp_get_proc_bind_' :: omp_get_proc_bind -!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime -!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick -!dec$ attributes alias:'omp_get_default_device_'::omp_get_default_device -!dec$ attributes alias:'omp_set_default_device_'::omp_set_default_device -!dec$ attributes alias:'omp_get_num_devices_'::omp_get_num_devices -!dec$ attributes alias:'omp_get_num_teams_'::omp_get_num_teams -!dec$ attributes alias:'omp_get_team_num_'::omp_get_team_num -!dec$ attributes alias:'omp_get_cancellation_'::omp_get_cancellation -!dec$ attributes alias:'omp_is_initial_device_'::omp_is_initial_device -!dec$ attributes alias:'omp_get_initial_device_'::omp_get_initial_device -!dec$ attributes alias:'omp_get_max_task_priority_'::omp_get_max_task_priority -!dec$ attributes alias:'omp_get_device_num_'::omp_get_device_num -!dec$ attributes alias:'omp_pause_resource_' :: omp_pause_resource -!dec$ attributes alias:'omp_pause_resource_all_' :: omp_pause_resource_all -!dec$ attributes alias:'omp_get_supported_active_levels_' :: omp_get_supported_active_levels -!dec$ attributes alias:'omp_fulfill_event_' :: omp_fulfill_event - -!dec$ attributes alias:'omp_set_affinity_format_' :: omp_set_affinity_format -!dec$ attributes alias:'omp_get_affinity_format_' :: omp_get_affinity_format -!dec$ attributes alias:'omp_display_affinity_' :: omp_display_affinity -!dec$ attributes alias:'omp_capture_affinity_' :: omp_capture_affinity - -!dec$ attributes alias:'omp_init_lock_'::omp_init_lock -!dec$ attributes alias:'omp_init_lock_with_hint_'::omp_init_lock_with_hint -!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock -!dec$ attributes alias:'omp_set_lock_'::omp_set_lock -!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock -!dec$ attributes alias:'omp_test_lock_'::omp_test_lock -!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock -!dec$ attributes alias:'omp_init_nest_lock_with_hint_'::omp_init_nest_lock_with_hint -!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock -!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock -!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock -!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock -!dec$ 
attributes alias:'omp_control_tool_'::omp_control_tool - -!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize -!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s -!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime -!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial -!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround -!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput -!dec$ attributes alias:'kmp_set_library_'::kmp_set_library -!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize -!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s -!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime -!dec$ attributes alias:'kmp_get_library_'::kmp_get_library -!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity -!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity -!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc -!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask -!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask -!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc -!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc -!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc -!dec$ attributes alias:'kmp_malloc_'::kmp_malloc -!dec$ attributes alias:'kmp_aligned_malloc_'::kmp_aligned_malloc -!dec$ attributes alias:'kmp_calloc_'::kmp_calloc -!dec$ attributes alias:'kmp_realloc_'::kmp_realloc -!dec$ attributes alias:'kmp_free_'::kmp_free - -!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on -!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off -!dec$ attributes alias:'kmp_get_cancellation_status_'::kmp_get_cancellation_status - -!dec$ endif - -!dec$ if defined(__APPLE__) - -!*** -!*** The Mac entry points are in lowercase, with an both an underscore -!*** appended and an underscore prepended. 
-!*** - -!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads -!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic -!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested -!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads -!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads -!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num -!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs -!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel -!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic -!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested -!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit -!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels -!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels -!dec$ attributes alias:'_omp_get_level_'::omp_get_level -!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level -!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num -!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size -!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule -!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule -!dec$ attributes alias:'_omp_get_proc_bind_' :: omp_get_proc_bind -!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime -!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick -!dec$ attributes alias:'_omp_get_default_device_'::omp_get_default_device -!dec$ attributes alias:'_omp_set_default_device_'::omp_set_default_device -!dec$ attributes alias:'_omp_get_num_devices_'::omp_get_num_devices -!dec$ attributes alias:'_omp_get_num_teams_'::omp_get_num_teams -!dec$ attributes alias:'_omp_get_team_num_'::omp_get_team_num -!dec$ attributes alias:'_omp_get_cancellation_'::omp_get_cancellation -!dec$ attributes alias:'_omp_is_initial_device_'::omp_is_initial_device -!dec$ attributes alias:'_omp_get_initial_device_'::omp_get_initial_device -!dec$ attributes alias:'_omp_get_max_task_priorty_'::omp_get_max_task_priority -!dec$ attributes alias:'_omp_get_device_num_'::omp_get_device_num -!dec$ attributes alias:'_omp_pause_resource_' :: omp_pause_resource -!dec$ attributes alias:'_omp_pause_resource_all_' :: omp_pause_resource_all -!dec$ attributes alias:'_omp_get_supported_active_levels_' :: omp_get_supported_active_levels -!dec$ attributes alias:'_omp_fulfill_event_' :: omp_fulfill_event - -!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock -!dec$ attributes alias:'_omp_init_lock_with_hint_'::omp_init_lock_with_hint -!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock -!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock -!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock -!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock -!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock -!dec$ attributes alias:'_omp_init_nest_lock_with_hint_'::omp_init_nest_lock_with_hint -!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock -!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock -!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock -!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock -!dec$ attributes alias:'_omp_control_tool_'::omp_control_tool -!dec$ attributes alias:'_omp_set_affinity_format_' :: omp_set_affinity_format -!dec$ attributes alias:'_omp_get_affinity_format_' :: omp_get_affinity_format -!dec$ attributes alias:'_omp_display_affinity_' 
:: omp_display_affinity -!dec$ attributes alias:'_omp_capture_affinity_' :: omp_capture_affinity - -!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize -!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s -!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime -!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial -!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround -!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput -!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library -!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize -!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s -!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime -!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library -!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity -!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity -!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc -!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask -!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask -!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc -!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc -!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc -!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc -!dec$ attributes alias:'_kmp_aligned_malloc_'::kmp_aligned_malloc -!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc -!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc -!dec$ attributes alias:'_kmp_free_'::kmp_free - -!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on -!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off - -!dec$ attributes alias:'_kmp_get_cancellation_status_'::kmp_get_cancellation_status - -!dec$ endif - - end module omp_lib diff --git a/runtime/src/include/omp_lib.f90.var b/runtime/src/include/omp_lib.f90.var index ac568486d..01b60fa5d 100644 --- a/runtime/src/include/omp_lib.f90.var +++ b/runtime/src/include/omp_lib.f90.var @@ -50,10 +50,16 @@ use omp_lib_kinds - integer (kind=omp_integer_kind), parameter :: openmp_version = @LIBOMP_OMP_YEAR_MONTH@ - integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@ - integer (kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBOMP_VERSION_MINOR@ - integer (kind=omp_integer_kind), parameter :: kmp_version_build = @LIBOMP_VERSION_BUILD@ + character(*) bolt_version + parameter( bolt_version = '@BOLT_VERSION@' ) + integer (kind=omp_integer_kind), parameter :: bolt_numversion = @BOLT_NUMVERSION@ + character(*) bolt_release_date + parameter( bolt_release_date = '@BOLT_RELEASE_DATE@' ) + + integer (kind=omp_integer_kind), parameter :: openmp_version = @LIBBOLT_OMP_YEAR_MONTH@ + integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBBOLT_VERSION_MAJOR@ + integer (kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBBOLT_VERSION_MINOR@ + integer (kind=omp_integer_kind), parameter :: kmp_version_build = @LIBBOLT_VERSION_BUILD@ character(*) kmp_build_date parameter( kmp_build_date = '@LIBOMP_BUILD_DATE@' ) @@ -98,7 +104,7 @@ integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_success = 0 integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_ignored = 1 - integer (kind=omp_alloctrait_key_kind), 
parameter :: omp_atk_threadmodel = 1 + integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_sync_hint = 1 integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_alignment = 2 integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_access = 3 integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pool_size = 4 @@ -107,12 +113,13 @@ integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pinned = 7 integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_partition = 8 + integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = -1 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_false = 0 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_true = 1 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = 2 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_contended = 3 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_uncontended = 4 - integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = 5 + integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_serialized = 5 + integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = omp_atv_serialized integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_private = 6 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_all = 7 integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_thread = 8 @@ -503,6 +510,103 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + use omp_lib_kinds + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + + + function omp_target_alloc(size, device_num) bind(c) + use omp_lib_kinds + type(c_ptr) omp_target_alloc + integer(c_size_t), value :: size + integer(c_int), value :: device_num + end function omp_target_alloc + + subroutine omp_target_free(device_ptr, device_num) bind(c) + use omp_lib_kinds + type(c_ptr), value :: device_ptr + integer(c_int), value :: device_num + end subroutine omp_target_free + + function omp_target_is_present(ptr, device_num) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_is_present + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_target_is_present + + function omp_target_memcpy(dst, src, length, dst_offset, src_offset, & + dst_device_num, src_device_num) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_memcpy + type(c_ptr), value :: dst, src + integer(c_size_t), value :: length, dst_offset, src_offset + integer(c_int), value :: dst_device_num, src_device_num + end function omp_target_memcpy + + function omp_target_memcpy_rect(dst, src, element_size, num_dims, & + volume, dst_offsets, src_offsets, dst_dimensions, & + src_dimensions, dst_device_num, src_device_num) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_memcpy_rect + type(c_ptr), value :: dst, src + integer(c_size_t), value :: element_size + integer(c_int), value :: num_dims, dst_device_num, src_device_num + integer(c_size_t), intent(in) :: volume(*), dst_offsets(*), & + src_offsets(*), dst_dimensions(*), src_dimensions(*) + end function omp_target_memcpy_rect + + function omp_target_memcpy_async(dst, src, length, dst_offset, & + src_offset, dst_device_num, src_device_num, depobj_count, & + depobj_list) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_memcpy_async + type(c_ptr), value :: dst, src + integer(c_size_t), value :: length, dst_offset, src_offset + integer(c_int), value :: dst_device_num, 
src_device_num, & + depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memcpy_async + + function omp_target_memcpy_rect_async(dst, src, element_size, & + num_dims, volume, dst_offsets, src_offsets, dst_dimensions, & + src_dimensions, dst_device_num, src_device_num, depobj_count, & + depobj_list) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_memcpy_rect_async + type(c_ptr), value :: dst, src + integer(c_size_t), value :: element_size + integer(c_int), value :: num_dims, dst_device_num, src_device_num, & + depobj_count + integer(c_size_t), intent(in) :: volume(*), dst_offsets(*), & + src_offsets(*), dst_dimensions(*), src_dimensions(*) + integer (omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memcpy_rect_async + + function omp_target_associate_ptr(host_ptr, device_ptr, size, & + device_offset, device_num) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_associate_ptr + type(c_ptr), value :: host_ptr, device_ptr + integer(c_size_t), value :: size, device_offset + integer(c_int), value :: device_num + end function omp_target_associate_ptr + + function omp_get_mapped_ptr(ptr, device_num) bind(c) + use omp_lib_kinds + type(c_ptr) omp_get_mapped_ptr + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_get_mapped_ptr + + function omp_target_disassociate_ptr(ptr, device_num) bind(c) + use omp_lib_kinds + integer(c_int) omp_target_disassociate_ptr + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_target_disassociate_ptr + ! *** ! *** kmp_* entry points ! *** diff --git a/runtime/src/include/omp_lib.h.var b/runtime/src/include/omp_lib.h.var index 877512815..d3c0b0e01 100644 --- a/runtime/src/include/omp_lib.h.var +++ b/runtime/src/include/omp_lib.h.var @@ -53,14 +53,21 @@ integer omp_event_handle_kind parameter(omp_event_handle_kind=int_ptr_kind()) + character(*)bolt_version + parameter(bolt_version='@BOLT_VERSION@') + integer(kind=omp_integer_kind)bolt_numversion + parameter(bolt_numversion=@BOLT_NUMVERSION@) + character(*)bolt_release_date + parameter(bolt_release_date='@BOLT_RELEASE_DATE@') + integer(kind=omp_integer_kind)openmp_version - parameter(openmp_version=@LIBOMP_OMP_YEAR_MONTH@) + parameter(openmp_version=@LIBBOLT_OMP_YEAR_MONTH@) integer(kind=omp_integer_kind)kmp_version_major - parameter(kmp_version_major=@LIBOMP_VERSION_MAJOR@) + parameter(kmp_version_major=@LIBBOLT_VERSION_MAJOR@) integer(kind=omp_integer_kind)kmp_version_minor - parameter(kmp_version_minor=@LIBOMP_VERSION_MINOR@) + parameter(kmp_version_minor=@LIBBOLT_VERSION_MINOR@) integer(kind=omp_integer_kind)kmp_version_build - parameter(kmp_version_build=@LIBOMP_VERSION_BUILD@) + parameter(kmp_version_build=@LIBBOLT_VERSION_BUILD@) character(*)kmp_build_date parameter(kmp_build_date='@LIBOMP_BUILD_DATE@') @@ -131,8 +138,8 @@ integer(omp_control_tool_result_kind)omp_control_tool_ignored parameter(omp_control_tool_ignored=1) - integer(kind=omp_alloctrait_key_kind)omp_atk_threadmodel - parameter(omp_atk_threadmodel=1) + integer(kind=omp_alloctrait_key_kind)omp_atk_sync_hint + parameter(omp_atk_sync_hint=1) integer(kind=omp_alloctrait_key_kind)omp_atk_alignment parameter(omp_atk_alignment=2) integer(kind=omp_alloctrait_key_kind)omp_atk_access @@ -148,18 +155,20 @@ integer(kind=omp_alloctrait_key_kind)omp_atk_partition parameter(omp_atk_partition=8) + integer(kind=omp_alloctrait_val_kind)omp_atv_default + parameter(omp_atv_default=-1) ! 
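The omp_atk_*/omp_atv_* constants being renumbered here mirror the OpenMP 5.0 allocator-trait API. As a hedged illustration of how such traits are consumed, the following sketch uses only standard omp.h entry points; the helper name is hypothetical and this is not BOLT code:

#include <cstddef>
#include <omp.h>

// Hypothetical helper; the trait keys/values are the constants defined here.
void allocator_traits_example(size_t bytes) {
  omp_alloctrait_t traits[2] = {
      {omp_atk_alignment, 64},                    // 64-byte-aligned allocations
      {omp_atk_fallback, omp_atv_default_mem_fb}  // fall back to default memory
  };
  omp_allocator_handle_t al =
      omp_init_allocator(omp_default_mem_space, 2, traits);
  void *p = omp_alloc(bytes, al);
  // ... use p ...
  omp_free(p, al);
  omp_destroy_allocator(al);
}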
Reserved for future use integer(kind=omp_alloctrait_val_kind)omp_atv_false parameter(omp_atv_false=0) ! Reserved for future use integer(kind=omp_alloctrait_val_kind)omp_atv_true parameter(omp_atv_true=1) - integer(kind=omp_alloctrait_val_kind)omp_atv_default - parameter(omp_atv_default=2) integer(kind=omp_alloctrait_val_kind)omp_atv_contended parameter(omp_atv_contended=3) integer(kind=omp_alloctrait_val_kind)omp_atv_uncontended parameter(omp_atv_uncontended=4) + integer(kind=omp_alloctrait_val_kind)omp_atv_serialized + parameter(omp_atv_serialized=5) integer(kind=omp_alloctrait_val_kind)omp_atv_sequential parameter(omp_atv_sequential=5) integer(kind=omp_alloctrait_val_kind)omp_atv_private @@ -580,6 +589,105 @@ integer (kind=kmp_size_t_kind) :: omp_capture_affinity end function omp_capture_affinity + subroutine omp_display_env(verbose) bind(c) + import + logical (kind=omp_logical_kind), value :: verbose + end subroutine omp_display_env + + function omp_target_alloc(size, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int + type(c_ptr) omp_target_alloc + integer(c_size_t), value :: size + integer(c_int), value :: device_num + end function omp_target_alloc + + subroutine omp_target_free(device_ptr, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int + type(c_ptr), value :: device_ptr + integer(c_int), value :: device_num + end subroutine omp_target_free + + function omp_target_is_present(ptr, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int + integer(c_int) omp_target_is_present + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_target_is_present + + function omp_target_memcpy(dst, src, length, dst_offset, & + & src_offset, dst_device_num, src_device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + integer(c_int) omp_target_memcpy + type(c_ptr), value :: dst, src + integer(c_size_t), value :: length, dst_offset, src_offset + integer(c_int), value :: dst_device_num, src_device_num + end function omp_target_memcpy + + function omp_target_memcpy_rect(dst, src, element_size, & + & num_dims, volume, dst_offsets, src_offsets, dst_dimensions, & + & src_dimensions, dst_device_num, src_device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + integer(c_int) omp_target_memcpy_rect + type(c_ptr), value :: dst, src + integer(c_size_t), value :: element_size + integer(c_int), value :: num_dims, dst_device_num, & + & src_device_num + integer(c_size_t), intent(in) :: volume(*), dst_offsets(*), & + & src_offsets(*), dst_dimensions(*), src_dimensions(*) + end function omp_target_memcpy_rect + + function omp_target_memcpy_async(dst, src, length, dst_offset, & + & src_offset, dst_device_num, src_device_num, depobj_count, & + & depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + import + integer(c_int) omp_target_memcpy_async + type(c_ptr), value :: dst, src + integer(c_size_t), value :: length, dst_offset, src_offset + integer(c_int), value :: dst_device_num, src_device_num, & + & depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memcpy_async + + function omp_target_memcpy_rect_async(dst, src, element_size, & + & num_dims, volume, dst_offsets, src_offsets, dst_dimensions, & + & src_dimensions, dst_device_num, src_device_num, & + & depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + import + integer(c_int) 
omp_target_memcpy_rect_async + type(c_ptr), value :: dst, src + integer(c_size_t), value :: element_size + integer(c_int), value :: num_dims, dst_device_num, & + & src_device_num, depobj_count + integer(c_size_t), intent(in) :: volume(*), dst_offsets(*), & + & src_offsets(*), dst_dimensions(*), src_dimensions(*) + integer(omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memcpy_rect_async + + function omp_target_associate_ptr(host_ptr, device_ptr, size, & + & device_offset, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int + integer(c_int) omp_target_associate_ptr + type(c_ptr), value :: host_ptr, device_ptr + integer(c_size_t), value :: size, device_offset + integer(c_int), value :: device_num + end function omp_target_associate_ptr + + function omp_get_mapped_ptr(ptr, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int + type(c_ptr) omp_get_mapped_ptr + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_get_mapped_ptr + + function omp_target_disassociate_ptr(ptr, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int + integer(c_int) omp_target_disassociate_ptr + type(c_ptr), value :: ptr + integer(c_int), value :: device_num + end function omp_target_disassociate_ptr + ! *** ! *** kmp_* entry points ! *** diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h index 23eebe673..273a66e8e 100644 --- a/runtime/src/kmp.h +++ b/runtime/src/kmp.h @@ -18,6 +18,10 @@ /* #define BUILD_PARALLEL_ORDERED 1 */ +#if KMP_USE_ABT +#include "kmp_abt.h" +#endif + /* This fix replaces gettimeofday with clock_gettime for better scalability on the Altix. Requires user code to be linked with -lrt. */ //#define FIX_SGI_CLOCK @@ -66,6 +70,8 @@ #include #include #include +#include +#include /* include don't use; problems with /MD on Windows* OS NT due to bad Microsoft library. Some macros provided below to replace these functions */ #ifndef __ABSOFT_WIN @@ -118,7 +124,7 @@ typedef unsigned int kmp_hwloc_depth_t; #endif #include "kmp_i18n.h" -#define KMP_HANDLE_SIGNALS (KMP_OS_UNIX || KMP_OS_WINDOWS) +#define KMP_HANDLE_SIGNALS ((KMP_OS_UNIX || KMP_OS_WINDOWS) && !KMP_USE_ABT) #include "kmp_wrapper_malloc.h" #if KMP_OS_UNIX @@ -136,6 +142,10 @@ typedef unsigned int kmp_hwloc_depth_t; #include "ompt-internal.h" #endif +#ifndef UNLIKELY +#define UNLIKELY(x) (x) +#endif + // Affinity format function #include "kmp_str.h" @@ -203,7 +213,7 @@ enum { KMP_IDENT_WORK_LOOP = 0x200, /*! To mark a sections directive in OMPT callbacks */ KMP_IDENT_WORK_SECTIONS = 0x400, - /*! To mark a distirbute construct in OMPT callbacks */ + /*! To mark a distribute construct in OMPT callbacks */ KMP_IDENT_WORK_DISTRIBUTE = 0x800, /*! Atomic hint; bottom four bits as omp_sync_hint_t. Top four reserved and not currently used. If one day we need more bits, then we can use @@ -214,6 +224,7 @@ enum { KMP_IDENT_ATOMIC_HINT_CONTENDED = 0x020000, KMP_IDENT_ATOMIC_HINT_NONSPECULATIVE = 0x040000, KMP_IDENT_ATOMIC_HINT_SPECULATIVE = 0x080000, + KMP_IDENT_OPENMP_SPEC_VERSION_MASK = 0xFF000000 }; /*! @@ -233,6 +244,10 @@ typedef struct ident { The string is composed of semi-colon separated fields which describe the source file, the function and a pair of line numbers that delimit the construct. */ + // Returns the OpenMP version in form major*10+minor (e.g., 50 for 5.0) + kmp_int32 get_openmp_version() { + return (((flags & KMP_IDENT_OPENMP_SPEC_VERSION_MASK) >> 24) & 0xFF); + } } ident_t; /*! 
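The new KMP_IDENT_OPENMP_SPEC_VERSION_MASK bits and ident_t::get_openmp_version() above pack the OpenMP spec version into the top byte of flags as major*10+minor. A standalone sketch of that encoding, mirroring the constants shown in this hunk (illustrative only, not runtime code):

#include <cassert>
#include <cstdint>

// Bits 24..31 of ident_t::flags hold the version, as in the hunk above.
static const uint32_t SPEC_VERSION_MASK = 0xFF000000; // KMP_IDENT_OPENMP_SPEC_VERSION_MASK

static int get_openmp_version(uint32_t flags) {
  return (int)(((flags & SPEC_VERSION_MASK) >> 24) & 0xFF); // major*10 + minor
}

int main() {
  // A front end targeting OpenMP 5.0 would store 50 in the top byte.
  uint32_t flags = (50u << 24) | 0x200u;   // 0x200 is KMP_IDENT_WORK_LOOP above
  assert(get_openmp_version(flags) == 50); // i.e., 5.0
  return 0;
}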
@} @@ -246,6 +261,10 @@ typedef union kmp_team kmp_team_p; typedef union kmp_info kmp_info_p; typedef union kmp_root kmp_root_p; +template class kmp_flag_32; +template class kmp_flag_64; +class kmp_flag_oncore; + #ifdef __cplusplus extern "C" { #endif @@ -681,6 +700,9 @@ class KMPAffinity { virtual int begin() const { return 0; } virtual int end() const { return 0; } virtual int next(int previous) const { return 0; } +#if KMP_OS_WINDOWS + virtual int set_process_affinity(bool abort_on_error) const { return -1; } +#endif // Set the system's affinity to this affinity mask's value virtual int set_system_affinity(bool abort_on_error) const { return -1; } // Set this affinity mask to the current system affinity @@ -803,7 +825,10 @@ typedef enum kmp_proc_bind_t { proc_bind_close, proc_bind_spread, proc_bind_intel, // use KMP_AFFINITY interface - proc_bind_default + proc_bind_default, +#if KMP_USE_ABT + proc_bind_unset +#endif } kmp_proc_bind_t; typedef struct kmp_nested_proc_bind_t { @@ -868,41 +893,42 @@ extern int __kmp_hws_abs_flag; // absolute or per-item number requested /* OpenMP 5.0 Memory Management support */ #ifndef __OMP_H -// Duplicate type definitios from omp.h +// Duplicate type definitions from omp.h typedef uintptr_t omp_uintptr_t; typedef enum { - OMP_ATK_THREADMODEL = 1, - OMP_ATK_ALIGNMENT = 2, - OMP_ATK_ACCESS = 3, - OMP_ATK_POOL_SIZE = 4, - OMP_ATK_FALLBACK = 5, - OMP_ATK_FB_DATA = 6, - OMP_ATK_PINNED = 7, - OMP_ATK_PARTITION = 8 + omp_atk_sync_hint = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 } omp_alloctrait_key_t; typedef enum { - OMP_ATV_FALSE = 0, - OMP_ATV_TRUE = 1, - OMP_ATV_DEFAULT = 2, - OMP_ATV_CONTENDED = 3, - OMP_ATV_UNCONTENDED = 4, - OMP_ATV_SEQUENTIAL = 5, - OMP_ATV_PRIVATE = 6, - OMP_ATV_ALL = 7, - OMP_ATV_THREAD = 8, - OMP_ATV_PTEAM = 9, - OMP_ATV_CGROUP = 10, - OMP_ATV_DEFAULT_MEM_FB = 11, - OMP_ATV_NULL_FB = 12, - OMP_ATV_ABORT_FB = 13, - OMP_ATV_ALLOCATOR_FB = 14, - OMP_ATV_ENVIRONMENT = 15, - OMP_ATV_NEAREST = 16, - OMP_ATV_BLOCKED = 17, - OMP_ATV_INTERLEAVED = 18 + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_serialized = 5, + omp_atv_sequential = omp_atv_serialized, // (deprecated) + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 } omp_alloctrait_value_t; +#define omp_atv_default ((omp_uintptr_t)-1) typedef void *omp_memspace_handle_t; extern omp_memspace_handle_t const omp_default_mem_space; @@ -929,7 +955,7 @@ extern omp_allocator_handle_t const omp_thread_mem_alloc; extern omp_allocator_handle_t const kmp_max_mem_alloc; extern omp_allocator_handle_t __kmp_def_allocator; -// end of duplicate type definitios from omp.h +// end of duplicate type definitions from omp.h #endif extern int __kmp_memkind_available; @@ -954,6 +980,11 @@ extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al); extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al); extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid); extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al); +extern void *__kmpc_calloc(int gtid, size_t nmemb, size_t sz, + 
omp_allocator_handle_t al); +extern void *__kmpc_realloc(int gtid, void *ptr, size_t sz, + omp_allocator_handle_t al, + omp_allocator_handle_t free_al); extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); extern void __kmp_init_memkind(); @@ -967,7 +998,8 @@ extern void __kmp_fini_memkind(); #define KMP_MIN_NTH 1 #ifndef KMP_MAX_NTH -#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX +#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX \ + && !KMP_USE_ABT #define KMP_MAX_NTH PTHREAD_THREADS_MAX #else #define KMP_MAX_NTH INT_MAX @@ -1095,12 +1127,12 @@ extern kmp_uint64 __kmp_now_nsec(); #define KMP_TLS_GTID_MIN INT_MAX #endif -#define KMP_MASTER_TID(tid) ((tid) == 0) -#define KMP_WORKER_TID(tid) ((tid) != 0) +#define KMP_MASTER_TID(tid) (0 == (tid)) +#define KMP_WORKER_TID(tid) (0 != (tid)) -#define KMP_MASTER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) == 0) -#define KMP_WORKER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) != 0) -#define KMP_INITIAL_GTID(gtid) ((gtid) == 0) +#define KMP_MASTER_GTID(gtid) (0 == __kmp_tid_from_gtid((gtid))) +#define KMP_WORKER_GTID(gtid) (0 != __kmp_tid_from_gtid((gtid))) +#define KMP_INITIAL_GTID(gtid) (0 == (gtid)) #ifndef TRUE #define FALSE 0 @@ -1112,9 +1144,6 @@ extern kmp_uint64 __kmp_now_nsec(); #if KMP_OS_WINDOWS #define KMP_INIT_WAIT 64U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 32U /* susequent number of spin-tests */ -#elif KMP_OS_CNK -#define KMP_INIT_WAIT 16U /* initial number of spin-tests */ -#define KMP_NEXT_WAIT 8U /* susequent number of spin-tests */ #elif KMP_OS_LINUX #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ @@ -1307,6 +1336,84 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); } } \ } +// User-level Monitor/Mwait +#if KMP_HAVE_UMWAIT +// We always try for UMWAIT first +#if KMP_HAVE_WAITPKG_INTRINSICS +#if KMP_HAVE_IMMINTRIN_H +#include +#elif KMP_HAVE_INTRIN_H +#include +#endif +#endif // KMP_HAVE_WAITPKG_INTRINSICS +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int +__kmp_tpause(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _tpause(hint, counter); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline void +__kmp_umonitor(void *cacheline) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 " + : + : "a"(cacheline) + :); +#else + _umonitor(cacheline); +#endif +} +KMP_ATTRIBUTE_TARGET_WAITPKG +static inline int +__kmp_umwait(uint32_t hint, uint64_t counter) { +#if !KMP_HAVE_WAITPKG_INTRINSICS + uint32_t timeHi = uint32_t(counter >> 32); + uint32_t timeLo = uint32_t(counter & 0xffffffff); + char flag; + __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n" + "setb %0" + : "=r"(flag) + : "a"(timeLo), "d"(timeHi), "c"(hint) + :); + return flag; +#else + return _umwait(hint, counter); +#endif +} +#elif KMP_HAVE_MWAIT +#if KMP_OS_UNIX +#include +#else +#include +#endif +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void +__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) { + _mm_monitor(cacheline, extensions, hints); +} +#if KMP_OS_UNIX +__attribute__((target("sse3"))) +#endif +static inline void 
+__kmp_mm_mwait(unsigned extensions, unsigned hints) { + _mm_mwait(extensions, hints); +} +#endif // KMP_HAVE_UMWAIT + /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. */ /* ------------------------------------------------------------------------ */ @@ -1351,6 +1458,18 @@ struct kmp_region_info { /* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */ +#if KMP_USE_ABT + +typedef ABT_thread kmp_thread_t; +typedef ABT_key kmp_key_t; +typedef ABT_thread kmp_abt_task_t; +typedef ABT_barrier kmp_barrier_t; +typedef pthread_key_t kmp_pth_key_t; + +extern kmp_pth_key_t __kmp_gtid_threadprivate_key; + +#else + #if KMP_OS_WINDOWS typedef HANDLE kmp_thread_t; typedef DWORD kmp_key_t; @@ -1363,6 +1482,8 @@ typedef pthread_key_t kmp_key_t; extern kmp_key_t __kmp_gtid_threadprivate_key; +#endif + typedef struct kmp_sys_info { long maxrss; /* the maximum resident set size utilized (in kilobytes) */ long minflt; /* the number of page faults serviced without any I/O */ @@ -1389,7 +1510,7 @@ The type for a microtask which gets passed to @ref __kmpc_fork_call(). The arguments to the outlined function are @param global_tid the global thread identity of the thread executing the function. -@param bound_tid the local identitiy of the thread executing the function +@param bound_tid the local identity of the thread executing the function @param ... pointers to shared variables accessed by the function. */ typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...); @@ -1548,7 +1669,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 tc; kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. @@ -1581,7 +1702,7 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 tc; /* trip count (number of iterations) */ kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */ - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -1722,11 +1843,7 @@ typedef struct kmp_disp { kmp_int32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags - union { // we can use union here because doacross cannot be used in - // nonmonotonic loops - kmp_int64 *th_doacross_info; // info on loop bounds - kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable) - }; + kmp_int64 *th_doacross_info; // info on loop bounds #if KMP_USE_INTERNODE_ALIGNMENT char more_padding[INTERNODE_CACHE_LINE]; #endif @@ -2077,7 +2194,7 @@ extern kmp_uint64 __kmp_taskloop_min_tasks; // The tt_found_tasks flag is a signal to all threads in the team that tasks // were spawned and queued since the previous barrier release. #define KMP_TASKING_ENABLED(task_team) \ - (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE) + (TRUE == TCR_SYNC_4((task_team)->tt.tt_found_tasks)) /*! 
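With KMP_USE_ABT, the typedefs above map the runtime's thread, key, and barrier handles onto Argobots objects (kmp_thread_t becomes ABT_thread, and so on). As a hedged, standalone illustration of that mapping, using plain Argobots calls rather than BOLT code, a user-level thread is created and joined like this:

#include <abt.h>

static void hello_ult(void *arg) { (void)arg; /* runs as a ULT, not a pthread */ }

int main(int argc, char *argv[]) {
  ABT_init(argc, argv);                     // BOLT performs this in its own init path
  ABT_xstream self;
  ABT_pool pool;
  ABT_xstream_self(&self);
  ABT_xstream_get_main_pools(self, 1, &pool);
  ABT_thread ult;                           // kmp_thread_t when KMP_USE_ABT is set
  ABT_thread_create(pool, hello_ult, NULL, ABT_THREAD_ATTR_NULL, &ult);
  ABT_thread_join(ult);
  ABT_thread_free(&ult);
  ABT_finalize();
  return 0;
}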
@ingroup BASIC_TYPES @{ @@ -2253,7 +2370,7 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ unsigned started : 1; /* 1==started, 0==not started */ unsigned executing : 1; /* 1==executing, 0==not executing */ unsigned complete : 1; /* 1==complete, 0==not complete */ - unsigned freed : 1; /* 1==freed, 0==allocateed */ + unsigned freed : 1; /* 1==freed, 0==allocated */ unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ unsigned reserved31 : 7; /* reserved for library use */ @@ -2287,7 +2404,7 @@ struct kmp_taskdata { /* aligned during dynamic allocation */ kmp_depnode_t *td_depnode; // Pointer to graph node if this task has dependencies kmp_task_team_t *td_task_team; - kmp_int32 td_size_alloc; // The size of task structure, including shareds etc. + size_t td_size_alloc; // Size of task structure, including shareds etc. #if defined(KMP_GOMP_COMPAT) // 4 or 8 byte integers for the loop bounds in GOMP_taskloop kmp_int32 td_size_loop_bounds; @@ -2301,6 +2418,11 @@ struct kmp_taskdata { /* aligned during dynamic allocation */ #if OMPT_SUPPORT ompt_task_info_t ompt_task_info; #endif +#if KMP_USE_ABT + kmp_abt_task_t *td_task_queue; // child tasks + kmp_int32 td_tq_cur_size; // current size of td_task_queue + kmp_int32 td_tq_max_size; // maximum size of td_task_queue +#endif }; // struct kmp_taskdata // Make sure padding above worked @@ -2435,10 +2557,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ -/* The blocktime info is copied from the team struct to the thread sruct */ -/* at the start of a barrier, and the values stored in the team are used */ -/* at points in the code where the team struct is no longer guaranteed */ -/* to exist (from the POV of worker threads). */ +/* The blocktime info is copied from the team struct to the thread struct */ +/* at the start of a barrier, and the values stored in the team are used */ +/* at points in the code where the team struct is no longer guaranteed */ +/* to exist (from the POV of worker threads). */ #if KMP_USE_MONITOR int th_team_bt_intervals; int th_team_bt_set; @@ -2468,6 +2590,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { #endif int th_prev_level; /* previous level for affinity format */ int th_prev_num_threads; /* previous num_threads for affinity format */ +#if KMP_USE_ABT + int th_current_place_id; /* place id currently bound to */ + int th_creation_group_end_tid; /* For N-way thread creation. */ +#endif /* KMP_USE_ABT */ #if USE_ITT_BUILD kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */ kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */ @@ -2519,8 +2645,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_hier_private_bdata_t *th_hier_bar_data; #endif +#if !KMP_USE_ABT /* Add the syncronizing data which is cache aligned and padded. 
*/ KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier]; +#endif KMP_ALIGN_CACHE volatile kmp_int32 th_next_waiting; /* gtid+1 of next thread on lock wait queue, 0 if none */ @@ -2602,7 +2730,11 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { // Synchronization Data // --------------------------------------------------------------------------- KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; +#if KMP_USE_ABT + kmp_barrier_t t_team_bar; +#else kmp_balign_team_t t_bar[bs_last_barrier]; +#endif std::atomic t_construct; // count of single directive encountered by team char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron @@ -2654,7 +2786,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { int t_level; // nested parallel level KMP_ALIGN_CACHE int t_max_argc; - int t_max_nproc; // max threads this team can handle (dynamicly expandable) + int t_max_nproc; // max threads this team can handle (dynamically expandable) int t_serialized; // levels deep of serialized teams dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system int t_id; // team's id, assigned by debugger. @@ -2669,6 +2801,14 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { // omp_set_num_threads() call omp_allocator_handle_t t_def_allocator; /* default allocator */ +#if KMP_USE_ABT && KMP_BARRIER_ICV_PUSH + KMP_ALIGN_CACHE + kmp_internal_control_t t_master_icvs; // master's icvs + // both are updated in __kmp_abt_xxx_workers + int t_master_place_id; // Inform child threads of the affinity information. + kmp_proc_bind_t t_proc_bind_applied; +#endif + // Read/write by workers as well #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf @@ -2723,13 +2863,9 @@ typedef union KMP_ALIGN_CACHE kmp_global { } kmp_global_t; typedef struct kmp_base_root { - // TODO: GEH - combine r_active with r_in_parallel then r_active == - // (r_in_parallel>= 0) // TODO: GEH - then replace r_active with t_active_levels if we can to reduce // the synch overhead or keeping r_active volatile int r_active; /* TRUE if some region in a nest has > 1 thread */ - // keeps a count of active parallel regions per root - std::atomic r_in_parallel; // GEH: This is misnamed, should be r_active_levels kmp_team_t *r_root_team; kmp_team_t *r_hot_team; @@ -2751,6 +2887,10 @@ struct fortran_inx_info { /* ------------------------------------------------------------------------ */ +#if KMP_USE_ABT +extern volatile int __kmp_abt_init_global; +#endif + extern int __kmp_settings; extern int __kmp_duplicate_library_ok; #if USE_ITT_BUILD @@ -2908,6 +3048,10 @@ extern int __kmp_max_nth; // maximum total number of concurrently-existing threads in a contention group extern int __kmp_cg_max_nth; extern int __kmp_teams_max_nth; // max threads used in a teams construct +#if KMP_REMOVE_FORKJOIN_LOCK +/* __kmp_threads_capacity must be protected by __kmp_threads_lock */ +/* write: lock read: anytime */ +#endif extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */ extern int __kmp_dflt_team_nth; /* default number of threads in a parallel @@ -3019,6 +3163,7 @@ extern int __kmp_omp_cancellation; /* TRUE or FALSE */ /* ------------------------------------------------------------------------- */ +#if !KMP_REMOVE_FORKJOIN_LOCK /* the following are protected by the fork/join lock */ /* write: lock read: anytime */ extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */ @@ -3036,6 +3181,30 @@ extern std::atomic __kmp_thread_pool_active_nth; extern kmp_root_t 
**__kmp_root; /* root of thread hierarchy */ /* end data protected by fork/join lock */ +#else +/* write: lock read: anytime */ +extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */ +extern kmp_bootstrap_lock_t __kmp_threads_lock; +/* read/write: lock */ +extern volatile kmp_team_t *__kmp_team_pool; +extern kmp_bootstrap_lock_t __kmp_team_pool_lock; +/* read/write: lock */ +extern volatile kmp_info_t *__kmp_thread_pool; +extern kmp_info_t *__kmp_thread_pool_insert_pt; +extern kmp_bootstrap_lock_t __kmp_thread_pool_lock; + +// Must be updated atomically +// total num threads reachable from some root thread including all root threads +extern volatile int __kmp_nth; +/* total number of threads reachable from some root thread including all root + threads, and those in the thread pool */ +extern volatile int __kmp_all_nth; +extern int __kmp_thread_pool_nth; +extern std::atomic __kmp_thread_pool_active_nth; + +extern kmp_root_t **__kmp_root; /* root of thread hierarchy */ +#endif + /* ------------------------------------------------------------------------- */ #define __kmp_get_gtid() __kmp_get_global_thread_id() @@ -3082,6 +3251,18 @@ static inline kmp_team_t *__kmp_team_from_gtid(int gtid) { return __kmp_threads[gtid]->th.th_team; } +static inline void __kmp_assert_valid_gtid(kmp_int32 gtid) { + if (UNLIKELY(gtid < 0 || gtid >= __kmp_threads_capacity)) + KMP_FATAL(ThreadIdentInvalid); +} + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +extern int __kmp_user_level_mwait; // TRUE or FALSE; from KMP_USER_LEVEL_MWAIT +extern int __kmp_umwait_enabled; // Runtime check if user-level mwait enabled +extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled +extern int __kmp_mwait_hints; // Hints to pass in to mwait +#endif + /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ @@ -3115,12 +3296,12 @@ extern void __kmp_internal_begin(void); extern void __kmp_internal_end_library(int gtid); extern void __kmp_internal_end_thread(int gtid); extern void __kmp_internal_end_atexit(void); -extern void __kmp_internal_end_fini(void); extern void __kmp_internal_end_dtor(void); extern void __kmp_internal_end_dest(void *); extern int __kmp_register_root(int initial_thread); extern void __kmp_unregister_root(int gtid); +extern void __kmp_unregister_library(void); // called by __kmp_internal_end() extern int __kmp_ignore_mppbeg(void); extern int __kmp_ignore_mppend(void); @@ -3284,17 +3465,14 @@ extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker, extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), void *obj); -class kmp_flag_32; -class kmp_flag_64; -class kmp_flag_oncore; -extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin #if USE_ITT_BUILD , void *itt_sync_obj #endif ); -extern void __kmp_release_64(kmp_flag_64 *flag); +extern void __kmp_release_64(kmp_flag_64<> *flag); extern void __kmp_infinite_loop(void); @@ -3372,9 +3550,15 @@ extern int __kmp_read_system_info(struct kmp_sys_info *info); extern void __kmp_create_monitor(kmp_info_t *th); #endif +#if !KMP_USE_ABT extern void *__kmp_launch_thread(kmp_info_t *thr); +#endif +#if !KMP_USE_ABT extern void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size); +#else +extern void __kmp_abt_create_workers(kmp_team_t *team); +#endif #if KMP_OS_WINDOWS extern int 
__kmp_still_running(kmp_info_t *th); @@ -3392,13 +3576,6 @@ extern int __kmp_try_suspend_mx(kmp_info_t *th); extern void __kmp_lock_suspend_mx(kmp_info_t *th); extern void __kmp_unlock_suspend_mx(kmp_info_t *th); -extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); -extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); -extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); -extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); -extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); -extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); - extern void __kmp_elapsed(double *); extern void __kmp_elapsed_tick(double *); @@ -3464,13 +3641,7 @@ enum fork_context_e { extern int __kmp_fork_call(ident_t *loc, int gtid, enum fork_context_e fork_context, kmp_int32 argc, microtask_t microtask, launch_t invoker, -/* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX - va_list *ap -#else - va_list ap -#endif - ); + kmp_va_list ap); extern void __kmp_join_call(ident_t *loc, int gtid #if OMPT_SUPPORT @@ -3502,7 +3673,7 @@ extern void __kmp_user_set_library(enum library_type arg); extern void __kmp_aux_set_library(enum library_type arg); extern void __kmp_aux_set_stacksize(size_t arg); extern void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid); -extern void __kmp_aux_set_defaults(char const *str, int len); +extern void __kmp_aux_set_defaults(char const *str, size_t len); /* Functions called from __kmp_aux_env_initialize() in kmp_settings.cpp */ void kmpc_set_blocktime(int arg); @@ -3529,28 +3700,6 @@ extern kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, kmp_task_t *task); extern void __kmp_fulfill_event(kmp_event_t *event); -int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_32 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_64 *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); -int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, - kmp_flag_oncore *flag, int final_spin, - int *thread_finished, -#if USE_ITT_BUILD - void *itt_sync_obj, -#endif /* USE_ITT_BUILD */ - kmp_int32 is_constrained); - extern void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team); extern void __kmp_reap_task_teams(void); @@ -3575,6 +3724,25 @@ extern kmp_uint64 __kmp_hardware_timestamp(void); extern int __kmp_read_from_file(char const *path, char const *format, ...); #endif +#if KMP_USE_ABT +extern void __kmp_abt_global_initialize(void); +extern void __kmp_abt_global_destroy(void); +extern void __kmp_abt_create_uber(int gtid, kmp_info_t *th, size_t stack_size); +extern void __kmp_abt_join_workers(kmp_team_t *team); +extern int __kmp_abt_create_task(kmp_info_t *th, kmp_task_t *task); +extern kmp_info_t *__kmp_abt_wait_child_tasks(kmp_info_t *th, bool thread_bind, + int yield); +extern kmp_info_t *__kmp_abt_bind_task_to_thread(kmp_team_t *team, + kmp_taskdata_t *taskdata); +extern void __kmp_abt_set_self_info(kmp_info_t *th); +extern kmp_info_t *__kmp_abt_get_self_info(void); +extern void __kmp_abt_release_info(kmp_info_t *th); +extern void __kmp_abt_acquire_info_for_task(kmp_info_t *th, + kmp_taskdata_t *taskdata, + const kmp_team_t 
*match_team, + int atomic = 1); +#endif + /* ------------------------------------------------------------------------ */ // // Assembly routines that have no compiler intrinsic replacement @@ -3717,6 +3885,12 @@ KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void *task_dup); +KMP_EXPORT void __kmpc_taskloop_5(ident_t *loc, kmp_int32 gtid, + kmp_task_t *task, kmp_int32 if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_int32 nogroup, kmp_int32 sched, + kmp_uint64 grainsize, kmp_int32 modifier, + void *task_dup); KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d); @@ -3881,7 +4055,6 @@ extern int __kmpc_get_target_offload(); // Constants used in libomptarget #define KMP_DEVICE_DEFAULT -1 // This is libomptarget's default device. -#define KMP_HOST_DEVICE -10 // This is what it is in libomptarget, go figure. #define KMP_DEVICE_ALL -11 // This is libomptarget's "all devices". // OMP Pause Resource @@ -3909,8 +4082,219 @@ static inline void __kmp_resume_if_hard_paused() { } } +extern void __kmp_omp_display_env(int verbose); + #ifdef __cplusplus } #endif +template +extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +extern void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag); +template +extern void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag); +extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag); +#endif +template +extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); +template +extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); +extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); + +template +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_32 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +template +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_64 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, + kmp_flag_oncore *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void *itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); + +/// This class safely opens and closes a C-style FILE* object using RAII +/// semantics. There are also methods which allow using stdout or stderr as +/// the underlying FILE* object. With the implicit conversion operator to +/// FILE*, an object with this type can be used in any function which takes +/// a FILE* object e.g., fprintf(). +/// No close method is needed at use sites. 
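A hedged usage sketch of the RAII file helper described above and defined just below; the function name is hypothetical and only members of the class are used:

#include <cstdio>
// Assumes the kmp.h context in which kmp_safe_raii_file_t is defined.

void write_report_example(const char *filename) {
  kmp_safe_raii_file_t out;
  if (filename)
    out.open(filename, "w");        // __kmp_fatal() is raised inside open() on failure
  else
    out.set_stdout();               // stdout/stderr are never fclose()d
  fprintf(out, "example output\n"); // implicit conversion to FILE*
}                                   // the destructor closes the file if it was fopen()ed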
+class kmp_safe_raii_file_t { + FILE *f; + + void close() { + if (f && f != stdout && f != stderr) { + fclose(f); + f = nullptr; + } + } + +public: + kmp_safe_raii_file_t() : f(nullptr) {} + kmp_safe_raii_file_t(const char *filename, const char *mode, + const char *env_var = nullptr) + : f(nullptr) { + open(filename, mode, env_var); + } + ~kmp_safe_raii_file_t() { close(); } + + /// Open filename using mode. This is automatically closed in the destructor. + /// The env_var parameter indicates the environment variable the filename + /// came from if != nullptr. + void open(const char *filename, const char *mode, + const char *env_var = nullptr) { + KMP_ASSERT(!f); + f = fopen(filename, mode); + if (!f) { + int code = errno; + if (env_var) { + __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), + KMP_HNT(CheckEnvVar, env_var, filename), __kmp_msg_null); + } else { + __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), + __kmp_msg_null); + } + } + } + /// Set the FILE* object to stdout and output there + /// No open call should happen before this call. + void set_stdout() { + KMP_ASSERT(!f); + f = stdout; + } + /// Set the FILE* object to stderr and output there + /// No open call should happen before this call. + void set_stderr() { + KMP_ASSERT(!f); + f = stderr; + } + operator bool() { return bool(f); } + operator FILE *() { return f; } +}; + +template <typename SourceType, typename TargetType, + bool isSourceSmaller = (sizeof(SourceType) < sizeof(TargetType)), + bool isSourceEqual = (sizeof(SourceType) == sizeof(TargetType)), + bool isSourceSigned = std::is_signed<SourceType>::value, + bool isTargetSigned = std::is_signed<TargetType>::value> +struct kmp_convert {}; + +// Both types are signed; Source smaller +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, true, false, true, true> { + static TargetType to(SourceType src) { return (TargetType)src; } +}; +// Source equal +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, true, true, true> { + static TargetType to(SourceType src) { return src; } +}; +// Source bigger +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, false, true, true> { + static TargetType to(SourceType src) { + KMP_ASSERT(src <= static_cast<SourceType>( + (std::numeric_limits<TargetType>::max)())); + KMP_ASSERT(src >= static_cast<SourceType>( + (std::numeric_limits<TargetType>::min)())); + return (TargetType)src; + } +}; + +// Source signed, Target unsigned +// Source smaller +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, true, false, true, false> { + static TargetType to(SourceType src) { + KMP_ASSERT(src >= 0); + return (TargetType)src; + } +}; +// Source equal +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, true, true, false> { + static TargetType to(SourceType src) { + KMP_ASSERT(src >= 0); + return (TargetType)src; + } +}; +// Source bigger +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, false, true, false> { + static TargetType to(SourceType src) { + KMP_ASSERT(src >= 0); + KMP_ASSERT(src <= static_cast<SourceType>( + (std::numeric_limits<TargetType>::max)())); + return (TargetType)src; + } +}; + +// Source unsigned, Target signed +// Source smaller +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, true, false, false, true> { + static TargetType to(SourceType src) { return (TargetType)src; } +}; +// Source equal +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, true, false, true> { + static TargetType to(SourceType src) { + KMP_ASSERT(src <= static_cast<SourceType>( + (std::numeric_limits<TargetType>::max)())); + return (TargetType)src; + } +}; +// Source bigger +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, false, false, true> { + static TargetType to(SourceType src) { + KMP_ASSERT(src <= static_cast<SourceType>( + (std::numeric_limits<TargetType>::max)())); + return (TargetType)src; + } +}; + +// Source unsigned, Target unsigned +// Source smaller +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, true, false, false, false> { + static TargetType to(SourceType src) { return (TargetType)src; } +}; +// Source equal +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, true, false, false> { + static TargetType to(SourceType src) { return src; } +}; +// Source bigger +template <typename SourceType, typename TargetType> +struct kmp_convert<SourceType, TargetType, false, false, false, false> { + static TargetType to(SourceType src) { + KMP_ASSERT(src <= static_cast<SourceType>( + (std::numeric_limits<TargetType>::max)())); + return (TargetType)src; + }
+}; + +template <typename T1, typename T2> +static inline void __kmp_type_convert(T1 src, T2 *dest) { + *dest = kmp_convert<T1, T2>::to(src); +} + #endif /* KMP_H */ diff --git a/runtime/src/kmp_abt.h b/runtime/src/kmp_abt.h new file mode 100644 index 000000000..56804a057 --- /dev/null +++ b/runtime/src/kmp_abt.h @@ -0,0 +1,96 @@ +/* + * kmp_abt.h -- header file. + */ + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_ABT_H +#define KMP_ABT_H + +#if KMP_USE_ABT + +#include <abt.h> + +#define KMP_ABT_FORK_NUM_WAYS_DEFAULT 2 +#define KMP_ABT_FORK_CUTOFF_DEFAULT (1 << 20) +#define KMP_ABT_SCHED_SLEEP_DEFAULT 0 +#define KMP_ABT_SCHED_MIN_SLEEP_NSEC_DEFAULT 1 /* 1ns */ +#define KMP_ABT_SCHED_MAX_SLEEP_NSEC_DEFAULT 1048576 /* 1ms */ +#define KMP_ABT_SCHED_EVENT_FREQ_DEFAULT 256 /* Every 256 work-stealing loops */ +#define KMP_ABT_SCHED_EVENT_FREQ_MAX 1048576 /* Needed to avoid deadlock */ +#define KMP_ABT_WORK_STEAL_FREQ_DEFAULT 65536 /* Every 65536 loops */ +
+static inline uint32_t __kmp_abt_fast_rand32(uint32_t *p_seed) { + // George Marsaglia, "Xorshift RNGs", Journal of Statistical Software, + // Articles, 2003 + uint32_t seed = *p_seed; + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + *p_seed = seed; + return seed; +} + +// ES-local data. +typedef struct kmp_abt_local { + /* ------------------------------------------------------------------------ */ + // Mostly read only + + ABT_xstream xstream; + ABT_sched sched; + + // Scheduler + ABT_pool shared_pool; + int place_id; + ABT_pool place_pool; + /* ------------------------------------------------------------------------ */ +} __attribute__((aligned(CACHE_LINE))) kmp_abt_local_t; + +// Global data. +typedef struct kmp_abt_global { + /* ------------------------------------------------------------------------ */ + // Mostly read only + int num_xstreams; + int fork_num_ways; + int fork_cutoff; + int is_sched_sleep; + int sched_sleep_min_nsec; + int sched_sleep_max_nsec; + int sched_event_freq; + uint32_t work_steal_freq; + int num_places; + ABT_pool *place_pools; + + // ES-local data.
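As context for the kmp_convert/__kmp_type_convert helpers added to kmp.h above: their point is to replace silent integral casts with range-checked conversions. The following standalone sketch is illustrative only; it uses plain assert() and a hypothetical to_uint8_checked helper instead of the KMP_ASSERT machinery, and shows the shape of the "source signed and wider, target unsigned and narrower" case.

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Hypothetical stand-in for KMP_ASSERT.
    #define MY_ASSERT(cond) assert(cond)

    // Same shape as the "source signed and wider, target unsigned and narrower"
    // specialization of kmp_convert: check the range first, then cast.
    static std::uint8_t to_uint8_checked(std::int64_t src) {
      MY_ASSERT(src >= 0);
      MY_ASSERT(src <= static_cast<std::int64_t>(
                           (std::numeric_limits<std::uint8_t>::max)()));
      return static_cast<std::uint8_t>(src);
    }

    int main() {
      std::int64_t children = 7;                            // e.g. numPerLevel[0] - 1
      std::uint8_t leaf_kids = to_uint8_checked(children);  // fits, no assert fires
      (void)leaf_kids;
      // to_uint8_checked(300);                             // would trip the range check
      return 0;
    }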
+ kmp_abt_local *locals; +} __attribute__((aligned(CACHE_LINE))) kmp_abt_global_t; + +extern kmp_abt_global_t __kmp_abt_global; + +typedef struct kmp_abt_affinity_place { + size_t num_ranks; + int *ranks; +} kmp_abt_affinity_place_t; + +typedef struct kmp_abt_affinity_places { + size_t num_places; + kmp_abt_affinity_place_t **p_places; +} kmp_abt_affinity_places_t; + +extern kmp_abt_affinity_places_t *__kmp_abt_parse_affinity(int num_xstreams, + const char *str, + size_t len, + bool verbose); +extern int __kmp_abt_affinity_place_find + (const kmp_abt_affinity_place_t *p_place, int rank); +extern void __kmp_abt_affinity_places_free(kmp_abt_affinity_places_t *p_places); + +#endif // KMP_USE_ABT +#endif // KMP_ABT_H diff --git a/runtime/src/kmp_abt_affinity.cpp b/runtime/src/kmp_abt_affinity.cpp new file mode 100644 index 000000000..17f621bb8 --- /dev/null +++ b/runtime/src/kmp_abt_affinity.cpp @@ -0,0 +1,869 @@ + +/* + * kmp_abt_affinity.cpp -- affinity parser for BOLT + */ + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +typedef struct kmp_abt_affinity_place { + size_t num_ranks; + int *ranks; +} kmp_abt_affinity_place_t; + +typedef struct kmp_abt_affinity_places { + size_t num_places; + kmp_abt_affinity_place_t **p_places; +} kmp_abt_affinity_places_t; + +typedef enum parse_pinterval { + pinterval_pls, + pinterval_pl, + pinterval_p, + pinterval_ip, +} parse_pinterval_t; + +typedef enum parse_res_interval { + res_interval_rns, + res_interval_rn, + res_interval_r, + res_interval_ir, +} parse_res_interval_t; + +typedef enum parse_word { + word_sockets, + word_cores, + word_threads, +} parse_word_t; + +typedef enum parse_num { + num_any, + num_positive, + num_nonnegative, +} parse_num_t; + +static kmp_abt_affinity_place_t *__kmp_abt_affinity_place_create() { + kmp_abt_affinity_place_t *p_new_place = (kmp_abt_affinity_place_t *) + malloc(sizeof(kmp_abt_affinity_place_t)); + p_new_place->num_ranks = 0; + p_new_place->ranks = NULL; + return p_new_place; +} + +static kmp_abt_affinity_place_t *__kmp_abt_affinity_place_create_rank + (int rank) { + kmp_abt_affinity_place_t *p_new_place = __kmp_abt_affinity_place_create(); + p_new_place->num_ranks = 1; + p_new_place->ranks = (int *)malloc(sizeof(int) * 1); + p_new_place->ranks[0] = rank; + return p_new_place; +} + +static kmp_abt_affinity_place_t *__kmp_abt_affinity_place_create_place + (const kmp_abt_affinity_place_t *p_place) { + kmp_abt_affinity_place_t *p_new_place = __kmp_abt_affinity_place_create(); + size_t num_ranks = p_place->num_ranks; + p_new_place->num_ranks = num_ranks; + p_new_place->ranks = (int *)malloc(sizeof(int) * num_ranks); + memcpy(p_new_place->ranks, p_place->ranks, sizeof(int) * num_ranks); + return p_new_place; +} + +static void __kmp_abt_affinity_place_free(kmp_abt_affinity_place_t *p_place) { + free(p_place->ranks); + free(p_place); +} + +int __kmp_abt_affinity_place_find(const kmp_abt_affinity_place_t *p_place, + int rank) { + for (size_t i = 0, num_ranks = p_place->num_ranks; i < num_ranks; i++) + if (p_place->ranks[i] == rank) + return 1; + return 0; +} + +static void __kmp_abt_affinity_place_insert(kmp_abt_affinity_place_t *p_place, + int rank) { + if (__kmp_abt_affinity_place_find(p_place,
rank)) + return; + size_t num_ranks = p_place->num_ranks; + size_t new_num_ranks = num_ranks + 1; + int *new_ranks = (int *)malloc(sizeof(int) * new_num_ranks); + memcpy(new_ranks, p_place->ranks, sizeof(int) * num_ranks); + free(p_place->ranks); + new_ranks[num_ranks] = rank; + p_place->ranks = new_ranks; + p_place->num_ranks = new_num_ranks; +} + +static void __kmp_abt_affinity_place_insert_place + (kmp_abt_affinity_place_t *p_place, + const kmp_abt_affinity_place_t *p_inserted) { + for (int i = 0, num_ranks = p_inserted->num_ranks; i < num_ranks; i++) + __kmp_abt_affinity_place_insert(p_place, p_inserted->ranks[i]); +} + +static kmp_abt_affinity_places_t *__kmp_abt_affinity_places_create() { + kmp_abt_affinity_places_t *p_new_places = (kmp_abt_affinity_places_t *) + malloc(sizeof(kmp_abt_affinity_places_t)); + p_new_places->num_places = 0; + p_new_places->p_places = NULL; + return p_new_places; +} + +void __kmp_abt_affinity_places_free(kmp_abt_affinity_places_t *p_places) { + for (size_t i = 0; i < p_places->num_places; i++) + __kmp_abt_affinity_place_free(p_places->p_places[i]); + free(p_places->p_places); + free(p_places); +} + +static void __kmp_abt_affinity_places_add(kmp_abt_affinity_places_t *p_places, + kmp_abt_affinity_place_t *p_place) { + size_t num_places = p_places->num_places; + size_t new_num_places = num_places + 1; + kmp_abt_affinity_place_t **p_new_places = (kmp_abt_affinity_place_t **) + malloc(sizeof(kmp_abt_affinity_place_t *) * new_num_places); + memcpy(p_new_places, p_places->p_places, sizeof(kmp_abt_affinity_place_t *) + * num_places); + free(p_places->p_places); + p_new_places[num_places] = p_place; + p_places->p_places = p_new_places; + p_places->num_places = new_num_places; +} + +static bool __kmp_abt_parse_num(int *p_val, parse_num_t type, + const char *str, size_t len, + size_t *p_consume) { + size_t consume = 0; + int sign = 1; + int val = 0; + if (len == consume) + return false; + while (1) { + if (len == consume) + return false; + if (str[consume] == '-') { + consume++; // Consume "-" + sign *= -1; + } else if (str[consume] == '+') { + consume++; // Consume "+" + sign *= 1; + } else { + break; + } + } + if (len == consume) + return false; + if (str[consume] < '0' || '9' < str[consume]) + return false; + val = int(str[consume] - '0'); + consume++; // Consume a digit. + while (1) { + if (len == consume || str[consume] < '0' || '9' < str[consume]) { + int ret_val = sign * val; + if (type == num_positive && ret_val <= 0) { + return false; + } else if (type == num_nonnegative && ret_val < 0) { + return false; + } + *p_consume += consume; + *p_val += sign * val; + return true; + } + val = val * 10 + int(str[consume] - '0'); + consume++; // Consume a digit. + } + // Unreachable. 
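The __kmp_abt_fast_rand32 helper declared in kmp_abt.h earlier in this patch is Marsaglia's 32-bit xorshift generator: three shift/xor steps update the state in place and return it. A self-contained copy of the same update, with an arbitrary seed and a made-up use of picking one of several pools, looks like this (names here are illustrative, not from the sources):

    #include <cstdint>
    #include <cstdio>

    // Marsaglia xorshift32, same three-shift update as __kmp_abt_fast_rand32.
    static inline std::uint32_t xorshift32(std::uint32_t *state) {
      std::uint32_t x = *state;
      x ^= x << 13;
      x ^= x >> 17;
      x ^= x << 5;
      *state = x;
      return x;
    }

    int main() {
      std::uint32_t seed = 0x12345678u;  // any nonzero seed works
      for (int i = 0; i < 4; ++i)
        std::printf("%u\n", xorshift32(&seed) % 8);  // e.g. pick one of 8 pools
      return 0;
    }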
+} + +static inline int __kmp_abt_parse_mod_place(int val, int num_xstreams) { + return ((val % num_xstreams) + num_xstreams) % num_xstreams; +} + +typedef struct kmp_abt_parse_res_interval { + parse_res_interval_t type; + int res; + int num_places; + int stride; +} kmp_abt_parse_res_interval_t; + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_create + (parse_res_interval_t type, unsigned int res, unsigned int num_places, + int stride) { + kmp_abt_parse_res_interval_t *p_parse_res_interval = + (kmp_abt_parse_res_interval_t *) + malloc(sizeof(kmp_abt_parse_res_interval_t)); + p_parse_res_interval->type = type; + p_parse_res_interval->res = res; + p_parse_res_interval->num_places = num_places; + p_parse_res_interval->stride = stride; + return p_parse_res_interval; + } + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_create_rns + (unsigned int res, unsigned int num_places, int stride) { + return __kmp_abt_parse_res_interval_create(res_interval_rns, res, num_places, + stride); +} + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_create_rn + (unsigned int res, unsigned int num_places) { + return __kmp_abt_parse_res_interval_create(res_interval_rn, res, num_places, + 0); +} + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_create_ir + (unsigned int res) { + return __kmp_abt_parse_res_interval_create(res_interval_ir, res, 0, 0); +} + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_create_r + (unsigned int res) { + return __kmp_abt_parse_res_interval_create(res_interval_r, res, 0, 0); +} + +static kmp_abt_parse_res_interval_t *__kmp_abt_parse_res_interval_parse + (const char *str, size_t len, size_t *p_consume) { + bool invert = false; + size_t consume = 0; + if (len != consume && str[consume] == '!') { + invert = true; + consume++; // Consume "!" 
+ } + int res = 0; + if (!__kmp_abt_parse_num(&res, num_nonnegative, str + consume, len - consume, + &consume)) + return NULL; + if (!invert && len != consume && str[consume] == ':') { + consume++; // Consume ":" + int num_places = 0; + if (!__kmp_abt_parse_num(&num_places, num_positive, str + consume, + len - consume, &consume)) + return NULL; + if (len != consume && str[consume] == ':') { + consume++; // Consume ":" + int stride = 0; + if (!__kmp_abt_parse_num(&stride, num_any, str + consume, len - consume, + &consume)) + return NULL; + *p_consume += consume; + return __kmp_abt_parse_res_interval_create_rns(res, num_places, stride); + } else { + *p_consume += consume; + return __kmp_abt_parse_res_interval_create_rn(res, num_places); + } + } else { + *p_consume += consume; + if (invert) { + return __kmp_abt_parse_res_interval_create_ir(res); + } else { + return __kmp_abt_parse_res_interval_create_r(res); + } + } +} + +static void __kmp_abt_parse_res_interval_free + (kmp_abt_parse_res_interval_t *p_res_interval) { + free(p_res_interval); +} + +static kmp_abt_affinity_place_t *__kmp_abt_parse_res_interval_generate_place + (const kmp_abt_parse_res_interval_t *p_res_interval, int num_xstreams) { + kmp_abt_affinity_place_t *p_place = __kmp_abt_affinity_place_create(); + if (p_res_interval->type == res_interval_rns) { + for (int i = 0; i < p_res_interval->num_places; i++) + __kmp_abt_affinity_place_insert(p_place, + __kmp_abt_parse_mod_place(p_res_interval->res + i + * p_res_interval->stride, num_xstreams)); + } else if (p_res_interval->type == res_interval_rn) { + for (int i = 0; i < p_res_interval->num_places; i++) + __kmp_abt_affinity_place_insert(p_place, + __kmp_abt_parse_mod_place(p_res_interval->res + i, num_xstreams)); + } else if (p_res_interval->type == res_interval_r) { + __kmp_abt_affinity_place_insert(p_place, + __kmp_abt_parse_mod_place(p_res_interval->res, num_xstreams)); + } else { + for (int i = 0; i < num_xstreams; i++) { + if (i != __kmp_abt_parse_mod_place(p_res_interval->res, num_xstreams)) + __kmp_abt_affinity_place_insert(p_place, i); + } + } + return p_place; +} + +static void __kmp_abt_parse_res_interval_print + (const kmp_abt_parse_res_interval_t *p_res_interval) { + if (p_res_interval->type == res_interval_rns) { + printf("%d:%d:%d", (int)p_res_interval->res, + (int)p_res_interval->num_places, (int)p_res_interval->stride); + } else if (p_res_interval->type == res_interval_rn) { + printf("%d:%d", (int)p_res_interval->res, (int)p_res_interval->num_places); + } else if (p_res_interval->type == res_interval_r) { + printf("%d", (int)p_res_interval->res); + } else { + printf("!%d", (int)p_res_interval->res); + } +} + +typedef struct kmp_abt_parse_res_list { + size_t len_res_intervals; + kmp_abt_parse_res_interval_t **p_res_intervals; +} kmp_abt_parse_res_list_t; + +static kmp_abt_parse_res_list_t *__kmp_abt_parse_res_list_create() { + kmp_abt_parse_res_list_t *p_res_list = (kmp_abt_parse_res_list_t *) + malloc(sizeof(kmp_abt_parse_res_list_t)); + p_res_list->len_res_intervals = 0; + p_res_list->p_res_intervals = NULL; + return p_res_list; +} + +static void __kmp_abt_parse_res_list_push_back + (kmp_abt_parse_res_list_t *p_res_list, + kmp_abt_parse_res_interval_t *p_res_interval) { + size_t len_res_intervals = p_res_list->len_res_intervals; + size_t new_len_res_intervals = len_res_intervals + 1; + kmp_abt_parse_res_interval_t **p_new_res_intervals + = (kmp_abt_parse_res_interval_t **) + malloc(sizeof(kmp_abt_parse_res_interval_t *) * new_len_res_intervals); + 
memcpy(p_new_res_intervals, p_res_list->p_res_intervals, + sizeof(kmp_abt_parse_res_interval_t *) * len_res_intervals); + free(p_res_list->p_res_intervals); + p_new_res_intervals[len_res_intervals] = p_res_interval; + p_res_list->len_res_intervals = new_len_res_intervals; + p_res_list->p_res_intervals = p_new_res_intervals; +} + +static void __kmp_abt_parse_res_list_free + (kmp_abt_parse_res_list_t *p_res_list) { + for (size_t i = 0; i < p_res_list->len_res_intervals; i++) + __kmp_abt_parse_res_interval_free(p_res_list->p_res_intervals[i]); + free(p_res_list->p_res_intervals); + free(p_res_list); +} + +static kmp_abt_parse_res_list_t *__kmp_abt_parse_res_list_parse + (const char *str, size_t len, size_t *p_consume) { + kmp_abt_parse_res_list_t *p_res_list = __kmp_abt_parse_res_list_create(); + size_t consume = 0; + while(1) { + kmp_abt_parse_res_interval_t *p_res_interval + = __kmp_abt_parse_res_interval_parse(str + consume, len - consume, + &consume); + if (!p_res_interval) { + __kmp_abt_parse_res_list_free(p_res_list); + return NULL; + } + __kmp_abt_parse_res_list_push_back(p_res_list, p_res_interval); + if (consume == len || str[consume] != ',') { + *p_consume += consume; + return p_res_list; + } + consume++; // Consume "," + } + // Unreachable. +} + +static kmp_abt_affinity_place_t *__kmp_abt_parse_res_list_generate_place + (const kmp_abt_parse_res_list_t *p_res_list, int num_xstreams) { + kmp_abt_affinity_place_t *p_place = __kmp_abt_affinity_place_create(); + for (size_t i = 0; i < p_res_list->len_res_intervals; i++) { + kmp_abt_parse_res_interval_t *p_res_interval + = p_res_list->p_res_intervals[i]; + kmp_abt_affinity_place_t *p_ret_place + = __kmp_abt_parse_res_interval_generate_place(p_res_interval, + num_xstreams); + __kmp_abt_affinity_place_insert_place(p_place, p_ret_place); + __kmp_abt_affinity_place_free(p_ret_place); + } + return p_place; +} + +static void __kmp_abt_parse_res_list_print + (const kmp_abt_parse_res_list_t *p_res_list) { + int index = 0; + for (size_t i = 0; i < p_res_list->len_res_intervals; i++) { + if (index++ != 0) + printf(","); + __kmp_abt_parse_res_interval_print(p_res_list->p_res_intervals[i]); + } +} + +typedef struct kmp_abt_parse_pinterval { + parse_pinterval_t type; + kmp_abt_parse_res_list_t *p_res_list; + int len; + int stride; +} kmp_abt_parse_pinterval_t; + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_create + (parse_pinterval_t type, kmp_abt_parse_res_list_t *p_res_list, int len, + int stride) { + kmp_abt_parse_pinterval_t *p_pinterval + = (kmp_abt_parse_pinterval_t *)malloc(sizeof(kmp_abt_parse_pinterval_t)); + p_pinterval->type = type; + p_pinterval->p_res_list = p_res_list; + p_pinterval->len = len; + p_pinterval->stride = stride; + return p_pinterval; +} + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_create_pls + (kmp_abt_parse_res_list_t *p_res_list, int len, int stride) { + return __kmp_abt_parse_pinterval_create(pinterval_pls, p_res_list, len, + stride); +} + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_create_pl + (kmp_abt_parse_res_list_t *p_res_list, int len) { + return __kmp_abt_parse_pinterval_create(pinterval_pl, p_res_list, len, 0); +} + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_create_ip + (kmp_abt_parse_res_list_t *p_res_list) { + return __kmp_abt_parse_pinterval_create(pinterval_ip, p_res_list, 0, 0); +} + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_create_p + (kmp_abt_parse_res_list_t *p_res_list) { + return 
__kmp_abt_parse_pinterval_create(pinterval_p, p_res_list, 0, 0); +} + +static void __kmp_abt_parse_pinterval_free + (kmp_abt_parse_pinterval_t *p_pinterval) { + __kmp_abt_parse_res_list_free(p_pinterval->p_res_list); + free(p_pinterval); +} + +static kmp_abt_parse_pinterval_t *__kmp_abt_parse_pinterval_parse + (const char *str, size_t len, size_t *p_consume) { + bool invert = false; + size_t consume = 0; + if (len != consume && str[consume] == '!') { + invert = true; + consume++; // Consume "!" + } + if (len == consume || str[consume] != '{') { + return NULL; + } else { + consume++; // Consume "{" + } + kmp_abt_parse_res_list_t *p_res_list + = __kmp_abt_parse_res_list_parse(str + consume, len - consume, &consume); + if (!p_res_list) + return NULL; + if (len == consume || str[consume] != '}') { + __kmp_abt_parse_res_list_free(p_res_list); + return NULL; + } else { + consume++; // Consume "{" + } + if (!invert && len != consume && str[consume] == ':') { + consume++; // Consume ":" + int len_val = 0; + if (!__kmp_abt_parse_num(&len_val, num_positive, str + consume, + len - consume, &consume)) { + __kmp_abt_parse_res_list_free(p_res_list); + return NULL; + } + if (len != consume && str[consume] == ':') { + consume++; // Consume ":" + int stride = 0; + if (!__kmp_abt_parse_num(&stride, num_any, str + consume, len - consume, + &consume)) { + __kmp_abt_parse_res_list_free(p_res_list); + return NULL; + } + *p_consume += consume; + return __kmp_abt_parse_pinterval_create_pls(p_res_list, len_val, stride); + } else { + *p_consume += consume; + return __kmp_abt_parse_pinterval_create_pl(p_res_list, len_val); + } + } else { + *p_consume += consume; + if (invert) { + return __kmp_abt_parse_pinterval_create_ip(p_res_list); + } else { + return __kmp_abt_parse_pinterval_create_p(p_res_list); + } + } +} + +static kmp_abt_affinity_places_t *__kmp_abt_parse_pinterval_generate_places + (const kmp_abt_parse_pinterval_t *p_pinterval, int num_xstreams) { + kmp_abt_affinity_place_t *p_place + = __kmp_abt_parse_res_list_generate_place(p_pinterval->p_res_list, + num_xstreams); + kmp_abt_affinity_places_t *p_places = __kmp_abt_affinity_places_create(); + if (p_pinterval->type == pinterval_pls) { + for (int i = 0, len = p_pinterval->len; i < len; i++) { + kmp_abt_affinity_place_t *p_tmp_place = __kmp_abt_affinity_place_create(); + for (size_t j = 0; j != p_place->num_ranks; j++) { + int val = p_place->ranks[j]; + __kmp_abt_affinity_place_insert(p_tmp_place, + __kmp_abt_parse_mod_place(val + i * p_pinterval->stride, + num_xstreams)); + } + __kmp_abt_affinity_places_add(p_places, p_tmp_place); + } + __kmp_abt_affinity_place_free(p_place); + } else if (p_pinterval->type == pinterval_pl) { + for (int i = 0, len = p_pinterval->len; i < len; i++) { + kmp_abt_affinity_place_t *p_tmp_place = __kmp_abt_affinity_place_create(); + for (size_t j = 0; j != p_place->num_ranks; j++) { + int val = p_place->ranks[j]; + __kmp_abt_affinity_place_insert(p_tmp_place, + __kmp_abt_parse_mod_place(val + i, num_xstreams)); + } + __kmp_abt_affinity_places_add(p_places, p_tmp_place); + } + __kmp_abt_affinity_place_free(p_place); + } else if (p_pinterval->type == pinterval_p) { + __kmp_abt_affinity_places_add(p_places, p_place); + } else { + // Invert. 
+ kmp_abt_affinity_place_t *p_tmp_place = __kmp_abt_affinity_place_create(); + for (int i = 0; i < num_xstreams; i++) { + if (!__kmp_abt_affinity_place_find(p_place, i)) + __kmp_abt_affinity_place_insert(p_tmp_place, i); + } + __kmp_abt_affinity_places_add(p_places, p_tmp_place); + __kmp_abt_affinity_place_free(p_place); + } + return p_places; +} + +static void __kmp_abt_parse_pinterval_print + (const kmp_abt_parse_pinterval_t *p_pinterval) { + if (p_pinterval->type == pinterval_pls) { + printf("{"); + __kmp_abt_parse_res_list_print(p_pinterval->p_res_list); + printf("}:%d:%d", (int)p_pinterval->len, (int)p_pinterval->stride); + } else if (p_pinterval->type == pinterval_pl) { + printf("{"); + __kmp_abt_parse_res_list_print(p_pinterval->p_res_list); + printf("}:%d", (int)p_pinterval->len); + } else if (p_pinterval->type == pinterval_p) { + printf("{"); + __kmp_abt_parse_res_list_print(p_pinterval->p_res_list); + printf("}"); + } else { + printf("!{"); + __kmp_abt_parse_res_list_print(p_pinterval->p_res_list); + printf("}"); + } +} + +typedef struct kmp_abt_parse_aname { + parse_word_t word; + int num_places; +} kmp_abt_parse_aname_t; + +static kmp_abt_parse_aname_t *__kmp_abt_parse_aname_create_p(parse_word_t word, + int num_places) { + kmp_abt_parse_aname_t *p_aname + = (kmp_abt_parse_aname_t *)malloc(sizeof(kmp_abt_parse_aname_t)); + p_aname->word = word; + p_aname->num_places = num_places; + return p_aname; +} + +static kmp_abt_parse_aname_t *__kmp_abt_parse_aname_create(parse_word_t word) { + return __kmp_abt_parse_aname_create_p(word, -1); +} + +static void __kmp_abt_parse_aname_free(kmp_abt_parse_aname_t *p_aname) { + free(p_aname); +} + +static kmp_abt_parse_aname_t *__kmp_abt_parse_aname_parse(const char *str, + size_t len, + size_t *p_consume) { + size_t consume = 0; + parse_word_t word; + if (len >= 7 && strncmp(str, "sockets", 7) == 0) { + consume = 7; + word = word_sockets; + } else if (len >= 5 && strncmp(str, "cores", 5) == 0) { + consume = 5; + word = word_cores; + } else if (len >= 7 && strncmp(str, "threads", 7) == 0) { + consume = 7; + word = word_threads; + } else { + return NULL; + } + if (len != consume && str[consume] == '(') { + consume++; // Consume "(" + int num_places = 0; + if (!__kmp_abt_parse_num(&num_places, num_positive, str + consume, + len - consume, &consume)) { + return NULL; + } + if (len != consume && str[consume] == ')') { + consume++; // Consume ")" + *p_consume += consume; + return __kmp_abt_parse_aname_create_p(word, num_places); + } + } else { + *p_consume += consume; + return __kmp_abt_parse_aname_create(word); + } + return NULL; +} + +static kmp_abt_affinity_places_t *__kmp_abt_parse_aname_generate_places + (const kmp_abt_parse_aname_t *p_aname, int num_xstreams) { + kmp_abt_affinity_places_t *p_places = __kmp_abt_affinity_places_create(); + if (p_aname->word == word_sockets || p_aname->num_places == -1) { + // Ignore. 
+ for (int i = 0; i < num_xstreams; i++) + __kmp_abt_affinity_places_add(p_places, + __kmp_abt_affinity_place_create_rank(i)); + } else { + for (int i = 0; i < p_aname->num_places; i++) { + int jstart = num_xstreams * i / p_aname->num_places; + int jend = num_xstreams * (i + 1) / p_aname->num_places; + kmp_abt_affinity_place_t *p_place = __kmp_abt_affinity_place_create(); + for (int j = jstart; j < jend; j++) + __kmp_abt_affinity_place_insert(p_place, j); + __kmp_abt_affinity_places_add(p_places, p_place); + } + } + return p_places; +} + +static void __kmp_abt_parse_aname_print(const kmp_abt_parse_aname_t *p_aname) { + if (p_aname->word == word_sockets) { + printf("sockets"); + } else if (p_aname->word == word_cores) { + printf("cores"); + } else { + printf("threads"); + } + if (p_aname->num_places != -1) { + printf("(%d)", (int)p_aname->num_places); + } +} + +typedef struct kmp_abt_parse_plist { + size_t len_pintervals; + kmp_abt_parse_pinterval_t **p_pintervals; +} kmp_abt_parse_plist_t; + +static kmp_abt_parse_plist_t *__kmp_abt_parse_plist_create() { + kmp_abt_parse_plist_t *p_plist + = (kmp_abt_parse_plist_t *)malloc(sizeof(kmp_abt_parse_plist_t)); + p_plist->len_pintervals = 0; + p_plist->p_pintervals = NULL; + return p_plist; +} + +static void __kmp_abt_parse_plist_free(kmp_abt_parse_plist_t *p_plist) { + for (size_t i = 0; i < p_plist->len_pintervals; i++) + __kmp_abt_parse_pinterval_free(p_plist->p_pintervals[i]); + free(p_plist->p_pintervals); + free(p_plist); +} + +static void __kmp_abt_parse_plist_add(kmp_abt_parse_plist_t *p_plist, + kmp_abt_parse_pinterval_t *p_pinterval) { + size_t len_pintervals = p_plist->len_pintervals; + size_t new_len_pintervals = len_pintervals + 1; + kmp_abt_parse_pinterval_t **p_new_pintervals + = (kmp_abt_parse_pinterval_t **)malloc(sizeof(kmp_abt_parse_pinterval_t *) + * new_len_pintervals); + memcpy(p_new_pintervals, p_plist->p_pintervals, + sizeof(kmp_abt_parse_pinterval_t *) * len_pintervals); + free(p_plist->p_pintervals); + p_new_pintervals[len_pintervals] = p_pinterval; + p_plist->len_pintervals = new_len_pintervals; + p_plist->p_pintervals = p_new_pintervals; +} + +static kmp_abt_parse_plist_t *__kmp_abt_parse_plist_parse(const char *str, + size_t len, + size_t *p_consume) { + kmp_abt_parse_plist_t *p_plist = __kmp_abt_parse_plist_create(); + size_t consume = 0; + while(1) { + kmp_abt_parse_pinterval_t *p_pinterval + = __kmp_abt_parse_pinterval_parse(str + consume, len - consume, + &consume); + if (!p_pinterval) { + __kmp_abt_parse_plist_free(p_plist); + return NULL; + } + __kmp_abt_parse_plist_add(p_plist, p_pinterval); + if (consume == len || str[consume] != ',') { + *p_consume += consume; + return p_plist; + } + consume++; // Consume "," + } + // Unreachable. 
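For reference, the res-interval expansion implemented by __kmp_abt_parse_res_interval_generate_place wraps every computed rank into [0, num_xstreams) through __kmp_abt_parse_mod_place (see the grammar comment in __kmp_abt_parse_affinity below). A small sketch with made-up numbers, 8 execution streams and an interval with start 6, length 4, stride 3, illustrates the wrap-around:

    #include <cstdio>

    // Same wrap-around rule as __kmp_abt_parse_mod_place: map any integer,
    // including negative strides, into [0, num_xstreams).
    static int mod_place(int val, int num_xstreams) {
      return ((val % num_xstreams) + num_xstreams) % num_xstreams;
    }

    int main() {
      // Illustration only: start 6, 4 entries, stride 3 on 8 streams
      // expands to the ranks 6, 1, 4, 7 because each step wraps modulo 8.
      const int num_xstreams = 8, res = 6, num_places = 4, stride = 3;
      for (int i = 0; i < num_places; ++i)
        std::printf("%d ", mod_place(res + i * stride, num_xstreams));
      std::printf("\n");
      return 0;
    }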
+} + +static kmp_abt_affinity_places_t *__kmp_abt_parse_plist_generate_places + (const kmp_abt_parse_plist_t *p_plist, int num_xstreams) { + kmp_abt_affinity_places_t *p_places = __kmp_abt_affinity_places_create(); + for (size_t i = 0; i < p_plist->len_pintervals; i++) { + kmp_abt_affinity_places_t *p_ret_places + = __kmp_abt_parse_pinterval_generate_places(p_plist->p_pintervals[i], + num_xstreams); + for (size_t j = 0; j < p_ret_places->num_places; j++) { + kmp_abt_affinity_place_t *p_place + = __kmp_abt_affinity_place_create_place(p_ret_places->p_places[j]); + __kmp_abt_affinity_places_add(p_places, p_place); + } + __kmp_abt_affinity_places_free(p_ret_places); + } + return p_places; +} + +static void __kmp_abt_parse_plist_print(const kmp_abt_parse_plist_t *p_plist) { + int index = 0; + for (size_t i = 0; i < p_plist->len_pintervals; i++) { + if (index++ != 0) + printf(","); + __kmp_abt_parse_pinterval_print(p_plist->p_pintervals[i]); + } +} + +typedef struct kmp_abt_parse_list { + kmp_abt_parse_plist_t *p_plist; + kmp_abt_parse_aname_t *p_aname; +} kmp_abt_parse_list_t; + +static kmp_abt_parse_list_t *__kmp_abt_parse_list_create + (kmp_abt_parse_plist_t *p_plist, kmp_abt_parse_aname_t *p_aname) { + kmp_abt_parse_list_t *p_list + = (kmp_abt_parse_list_t *)malloc(sizeof(kmp_abt_parse_list_t)); + p_list->p_plist = p_plist; + p_list->p_aname = p_aname; + return p_list; +} + +static kmp_abt_parse_list_t *__kmp_abt_parse_list_create_plist + (kmp_abt_parse_plist_t *p_plist) { + return __kmp_abt_parse_list_create(p_plist, NULL); +} + +static kmp_abt_parse_list_t *__kmp_abt_parse_list_create_aname + (kmp_abt_parse_aname_t *p_aname) { + return __kmp_abt_parse_list_create(NULL, p_aname); +} + +static void __kmp_abt_parse_list_free(kmp_abt_parse_list_t *p_list) { + if (p_list->p_plist) + __kmp_abt_parse_plist_free(p_list->p_plist); + if (p_list->p_aname) + __kmp_abt_parse_aname_free(p_list->p_aname); + free(p_list); +} + +static kmp_abt_parse_list_t *__kmp_abt_parse_list_parse(const char *str, + size_t len, + size_t *p_consume) { + kmp_abt_parse_plist_t *p_plist = __kmp_abt_parse_plist_parse(str, len, + p_consume); + if (p_plist) + return __kmp_abt_parse_list_create_plist(p_plist); + kmp_abt_parse_aname_t *p_aname = __kmp_abt_parse_aname_parse(str, len, + p_consume); + if (p_aname) + return __kmp_abt_parse_list_create_aname(p_aname); + return NULL; +} + +static kmp_abt_affinity_places_t *__kmp_abt_parse_list_generate_places + (const kmp_abt_parse_list_t *p_list, int num_xstreams) { + if (p_list->p_plist) { + return __kmp_abt_parse_plist_generate_places(p_list->p_plist, num_xstreams); + } else { + return __kmp_abt_parse_aname_generate_places(p_list->p_aname, num_xstreams); + } +} + +static void __kmp_abt_parse_list_print(const kmp_abt_parse_list_t *p_list) { + if (p_list->p_plist) { + __kmp_abt_parse_plist_print(p_list->p_plist); + } else { + __kmp_abt_parse_aname_print(p_list->p_aname); + } +} + +kmp_abt_affinity_places_t *__kmp_abt_parse_affinity(int num_xstreams, + const char *str, size_t len, + bool verbose) { + // <list> |= <plist> | <aname> + // <plist> |= <pinterval> | <pinterval> , <plist> + // <pinterval> |= <place>:<len>:<stride> | <place>:<len> | <place> | !<place> + // <place> |= {<res_list>} + // <res_list> |= <res_interval> | <res_interval> , <res_list> + // <res_interval> |= <res>:<num_places>:<stride> | <res>:<num_places> | <res> + // | !<res>
+ // <aname> |= <word>(<num_places>) | <word> + // <word> |= sockets | cores | threads + // | + // <res> |= non-negative integer + // <num_places> |= positive integer + // <stride> |= integer + // <len> |= positive integer + + size_t consume = 0; + kmp_abt_parse_list_t *p_list = __kmp_abt_parse_list_parse(str, len, &consume); + kmp_abt_affinity_places_t *p_places = NULL; + if (!p_list) { + if (verbose) { + printf("parse failed:\n"); + printf("use default places.\n"); + } + // Create a default one. + p_places = __kmp_abt_affinity_places_create(); + for (int i = 0; i < num_xstreams; i++) { + kmp_abt_affinity_place_t *p_place + = __kmp_abt_affinity_place_create_rank(i); + __kmp_abt_affinity_places_add(p_places, p_place); + } + } else { + if (verbose) { + printf("parse succeeded:\n"); + printf(" %s\n->", str); + __kmp_abt_parse_list_print(p_list); + printf("\n"); + } + p_places = __kmp_abt_parse_list_generate_places(p_list, num_xstreams); + __kmp_abt_parse_list_free(p_list); + } + if (verbose) { + for (size_t i = 0; i < p_places->num_places; i++) { + if (i != 0) + printf(","); + printf("[%d]:{", (int)i); + bool is_first = true; + kmp_abt_affinity_place_t *p_place = p_places->p_places[i]; + for (size_t j = 0; j < p_place->num_ranks; j++) { + if (!is_first) + printf(","); + is_first = false; + printf("%d", p_place->ranks[j]); + } + printf("}"); + } + printf("\n"); + } + return p_places; +} diff --git a/runtime/src/kmp_affinity.cpp b/runtime/src/kmp_affinity.cpp index 4c7ed3181..103dc269d 100644 --- a/runtime/src/kmp_affinity.cpp +++ b/runtime/src/kmp_affinity.cpp @@ -40,7 +40,8 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { KMP_DEBUG_ASSERT(depth > 0); thr_bar->depth = depth; - thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1; + __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1, + &(thr_bar->base_leaf_kids)); thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; } @@ -130,14 +131,13 @@ char *__kmp_affinity_print_mask(char *buf, int buf_len, } // Range with three or more contiguous bits in the affinity mask if (previous - start > 1) { - KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start), - static_cast<int>(previous)); + KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous); } else { // Range with one or two contiguous bits in the affinity mask - KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start)); + KMP_SNPRINTF(scan, end - scan + 1, "%u", start); KMP_ADVANCE_SCAN(scan); if (previous - start > 0) { - KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous)); + KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous); } } KMP_ADVANCE_SCAN(scan); @@ -195,13 +195,12 @@ kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, } // Range with three or more contiguous bits in the affinity mask if (previous - start > 1) { - __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start), - static_cast<int>(previous)); + __kmp_str_buf_print(buf, "%u-%u", start, previous); } else { // Range with one or two contiguous bits in the affinity mask - __kmp_str_buf_print(buf, "%d", static_cast<int>(start)); + __kmp_str_buf_print(buf, "%u", start); if (previous - start > 0) { - __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous)); + __kmp_str_buf_print(buf, ",%u", previous); } } // Start over with new start point @@ -577,12 +576,20 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity_type == affinity_none); - - nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj( - hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE); - __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj( - hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU); + // hwloc only guarantees existance of PU object, so check PACKAGE and CORE + hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); + if (o != NULL) + nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE); + else + nCoresPerPkg = 1; // no PACKAGE found + o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0); + if (o != NULL) + __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU); + else + __kmp_nThreadsPerCore = 1; // no CORE found __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; + if (nCoresPerPkg == 0) + nCoresPerPkg = 1; // to prevent possible division by 0 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; if (__kmp_affinity_verbose) { KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); @@ -601,7 +608,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, int depth = 3; int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread - int labels[3] = {0}; // package [,node] [,tile] - head of lables array + int labels[3] = {0}; // package [,node] [,tile] - head of labels array if (__kmp_numa_detected) ++depth; if (__kmp_tile_depth) @@ -721,15 +728,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, @@ -783,13 +782,6 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, // Print the machine topology summary. if (__kmp_affinity_verbose) { - char mask[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -828,7 +820,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, } int depth_full = depth; // number of levels before compressing - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). 
depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, levels); @@ -890,16 +882,7 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, __kmp_ncores = nPackages = __kmp_avail_proc; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, - __kmp_affin_fullMask); - KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, @@ -918,7 +901,7 @@ static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, return 0; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); int avail_ct = 0; @@ -967,7 +950,7 @@ static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, return -1; } - // Contruct the data structure to be returned. + // Construct the data structure to be returned. *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); @@ -1124,7 +1107,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, // - Older OSes are usually found on machines with older chips, which do not // support HT. // - The performance penalty for mistakenly identifying a machine as HT when - // it isn't (which results in blocktime being incorrecly set to 0) is + // it isn't (which results in blocktime being incorrectly set to 0) is // greater than the penalty when for mistakenly identifying a machine as // being 1 thread/core when it is really HT enabled (which results in // blocktime being incorrectly set to a positive value). @@ -1265,15 +1248,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, @@ -1398,15 +1373,7 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, // not enabled. 
__kmp_ncores = nCores; if (__kmp_affinity_verbose) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (__kmp_affinity_uniform_topology()) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -1681,15 +1648,7 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, @@ -1805,15 +1764,7 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, // Print the machine topology summary. if (__kmp_affinity_verbose) { - char mask[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); - KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -1849,7 +1800,7 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, return 0; } - // Find any levels with radiix 1, and remove them from the map + // Find any levels with radix 1, and remove them from the map // (except for the package level). int new_depth = 0; for (level = 0; level < depth; level++) { @@ -1968,7 +1919,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; else @@ -1982,7 +1934,8 @@ static void __kmp_dispatch_set_hierarchy_values() { __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_nThreadsPerCore; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED if (__kmp_mic_type >= mic3) __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2 * __kmp_nThreadsPerCore; @@ -2076,7 +2029,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, return -1; } - // Set the file pointer back to the begginning, so that we can scan the file + // Set the file pointer back to the beginning, so that we can scan the file // again, this time performing a full parse of the data. 
Allocate a vector of // ProcCpuInfo object, where we will place the data. Adding an extra element // at the end allows us to remove a lot of extra checks for termination @@ -2322,15 +2275,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, - __kmp_affin_fullMask); KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); } @@ -2461,7 +2406,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, threadInfo[i][threadIdIndex] = threadIdCt++; } - // Aparrently the thread id field was specified for some entries and + // Apparently the thread id field was specified for some entries and // not others. Start the thread id counter off at the next higher // thread id. else if (threadIdCt <= threadInfo[i][threadIdIndex]) { @@ -2531,15 +2476,7 @@ static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, KMP_INFORM(NonUniform, "KMP_AFFINITY"); } } else { - char buf[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, - __kmp_affin_fullMask); KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); - } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -3977,14 +3914,6 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { *pAddr = newAddr; // replace old topology with new one } if (__kmp_affinity_verbose) { - char m[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, - __kmp_affin_fullMask); - if (__kmp_affinity_respect_mask) { - KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); - } else { - KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); - } KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); kmp_str_buf_t buf; __kmp_str_buf_init(&buf); @@ -4146,9 +4075,8 @@ static void __kmp_aux_affinity_initialize(void) { KMP_CPU_ALLOC(__kmp_affin_fullMask); } if (KMP_AFFINITY_CAPABLE()) { + __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); if (__kmp_affinity_respect_mask) { - __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); - // Count the number of available processors. 
unsigned i; __kmp_avail_proc = 0; @@ -4168,9 +4096,27 @@ static void __kmp_aux_affinity_initialize(void) { KMP_AFFINITY_DISABLE(); return; } + + if (__kmp_affinity_verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + __kmp_affin_fullMask); + KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); + } } else { + if (__kmp_affinity_verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + __kmp_affin_fullMask); + KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); + } __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); __kmp_avail_proc = __kmp_xproc; +#if KMP_OS_WINDOWS + // Set the process affinity mask since threads' affinity + // masks must be subset of process mask in Windows* OS + __kmp_affin_fullMask->set_process_affinity(true); +#endif } } @@ -4194,7 +4140,7 @@ static void __kmp_aux_affinity_initialize(void) { if (__kmp_affinity_top_method == affinity_top_method_all) { // In the default code path, errors are not fatal - we just try using // another method. We only emit a warning message if affinity is on, or the - // verbose flag is set, an the nowarnings flag was not set. + // verbose flag is set, and the nowarnings flag was not set. const char *file_name = NULL; int line = 0; #if KMP_USE_HWLOC @@ -4263,17 +4209,10 @@ static void __kmp_aux_affinity_initialize(void) { } } - FILE *f = fopen("/proc/cpuinfo", "r"); - if (f == NULL) { - msg_id = kmp_i18n_str_CantOpenCpuinfo; - } else { - file_name = "/proc/cpuinfo"; - depth = - __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - fclose(f); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } + kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); + depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); + if (depth == 0) { + KMP_EXIT_AFF_NONE; } } @@ -4328,7 +4267,7 @@ static void __kmp_aux_affinity_initialize(void) { } #endif // KMP_USE_HWLOC -// If the user has specified that a paricular topology discovery method is to be +// If the user has specified that a particular topology discovery method is to be // used, then we abort if that method fails. The exception is group affinity, // which might have been implicitly set. 
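The /proc/cpuinfo hunk above is one of the call sites converted to kmp_safe_raii_file_t, so the file is closed automatically even on the early KMP_EXIT_AFF_NONE return and the explicit fclose disappears. A generic sketch of the same RAII idea in plain C++ (hypothetical names, an exception instead of __kmp_fatal, none of the kmp error machinery):

    #include <cstdio>
    #include <stdexcept>

    // Minimal RAII wrapper in the spirit of kmp_safe_raii_file_t: fopen in the
    // constructor, fclose in the destructor, implicit conversion to FILE*.
    class scoped_file {
      std::FILE *f_;
    public:
      scoped_file(const char *path, const char *mode)
          : f_(std::fopen(path, mode)) {
        if (!f_)
          throw std::runtime_error("cannot open file");
      }
      ~scoped_file() {
        if (f_)
          std::fclose(f_);
      }
      scoped_file(const scoped_file &) = delete;
      scoped_file &operator=(const scoped_file &) = delete;
      operator std::FILE *() { return f_; }
    };

    int main() {
      try {
        scoped_file f("/proc/cpuinfo", "r");
        char line[256];
        if (std::fgets(line, sizeof(line), f))  // FILE* conversion, as in the kmp class
          std::printf("%s", line);
      } catch (const std::exception &e) {
        std::printf("open failed: %s\n", e.what());
      }
      return 0;  // file closed by the destructor on every path
    }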
@@ -4366,8 +4305,10 @@ static void __kmp_aux_affinity_initialize(void) { else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { const char *filename; + const char *env_var = nullptr; if (__kmp_cpuinfo_file != NULL) { filename = __kmp_cpuinfo_file; + env_var = "KMP_CPUINFO_FILE"; } else { filename = "/proc/cpuinfo"; } @@ -4376,20 +4317,9 @@ static void __kmp_aux_affinity_initialize(void) { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); } - FILE *f = fopen(filename, "r"); - if (f == NULL) { - int code = errno; - if (__kmp_cpuinfo_file != NULL) { - __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), - KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null); - } else { - __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), - __kmp_msg_null); - } - } + kmp_safe_raii_file_t f(filename, "r", env_var); int line = 0; depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - fclose(f); if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); if (line > 0) { @@ -4647,7 +4577,7 @@ static void __kmp_aux_affinity_initialize(void) { #undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { - // Much of the code above was written assumming that if a machine was not + // Much of the code above was written assuming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. We now // explicitly represent this as __kmp_affinity_type == affinity_disabled. // There are too many checks for __kmp_affinity_type == affinity_none @@ -4713,7 +4643,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { KMP_CPU_ZERO(th->th.th_affin_mask); } - // Copy the thread mask to the kmp_info_t strucuture. If + // Copy the thread mask to the kmp_info_t structure. If // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, // then the full mask is the same as the mask of the initialization thread. @@ -4823,7 +4753,7 @@ void __kmp_affinity_set_place(int gtid) { (th->th.th_new_place >= th->th.th_last_place)); } - // Copy the thread mask to the kmp_info_t strucuture, + // Copy the thread mask to the kmp_info_t structure, // and set this thread's affinity. 
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); @@ -5300,7 +5230,7 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { } } -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_FREEBSD // We don't need this entry for Windows because // there is GetProcessAffinityMask() api // diff --git a/runtime/src/kmp_affinity.h b/runtime/src/kmp_affinity.h index f270bb6db..013080bbc 100644 --- a/runtime/src/kmp_affinity.h +++ b/runtime/src/kmp_affinity.h @@ -54,7 +54,7 @@ class KMPHwlocAffinity : public KMPAffinity { int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); - int retval = + long retval = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); if (retval >= 0) { return 0; @@ -67,8 +67,8 @@ class KMPHwlocAffinity : public KMPAffinity { } int set_system_affinity(bool abort_on_error) const override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), - "Illegal get affinity operation when not capable"); + "Illegal set affinity operation when not capable"); - int retval = + long retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); if (retval >= 0) { return 0; @@ -79,6 +79,26 @@ class KMPHwlocAffinity : public KMPAffinity { } return error; } +#if KMP_OS_WINDOWS + int set_process_affinity(bool abort_on_error) const override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set process affinity operation when not capable"); + int error = 0; + const hwloc_topology_support *support = + hwloc_topology_get_support(__kmp_hwloc_topology); + if (support->cpubind->set_proc_cpubind) { + int retval; + retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, + HWLOC_CPUBIND_PROCESS); + if (retval >= 0) + return 0; + error = errno; + if (abort_on_error) + __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null); + } + return error; + } +#endif int get_proc_group() const override { int group = -1; #if KMP_OS_WINDOWS @@ -241,8 +261,13 @@ class KMPHwlocAffinity : public KMPAffinity { #endif class KMPNativeAffinity : public KMPAffinity { class Mask : public KMPAffinity::Mask { - typedef unsigned char mask_t; - static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; + typedef unsigned long mask_t; + typedef decltype(__kmp_affin_mask_size) mask_size_type; + static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; + static const mask_t ONE = 1; + mask_size_type get_num_mask_types() const { + return __kmp_affin_mask_size / sizeof(mask_t); + } public: mask_t *mask; @@ -252,35 +277,40 @@ class KMPNativeAffinity : public KMPAffinity { __kmp_free(mask); } void set(int i) override { - mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); + mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T)); } bool is_set(int i) const override { - return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); + return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T))); } void clear(int i) override { - mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); + mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T)); } void zero() override { - for (size_t i = 0; i < __kmp_affin_mask_size; ++i) - mask[i] = 0; + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) + mask[i] = (mask_t)0; } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); - for (size_t i = 0; i < __kmp_affin_mask_size; ++i) + mask_size_type e = get_num_mask_types(); + for
(mask_size_type i = 0; i < e; ++i) mask[i] = convert->mask[i]; } void bitwise_and(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast<const Mask *>(rhs); - for (size_t i = 0; i < __kmp_affin_mask_size; ++i) + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) mask[i] &= convert->mask[i]; } void bitwise_or(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast<const Mask *>(rhs); - for (size_t i = 0; i < __kmp_affin_mask_size; ++i) + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) mask[i] |= convert->mask[i]; } void bitwise_not() override { - for (size_t i = 0; i < __kmp_affin_mask_size; ++i) + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) mask[i] = ~(mask[i]); } int begin() const override { @@ -289,7 +319,11 @@ class KMPNativeAffinity : public KMPAffinity { ++retval; return retval; } - int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; } + int end() const override { + int e; + __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e); + return e; + } int next(int previous) const override { int retval = previous + 1; while (retval < end() && !is_set(retval)) @@ -300,11 +334,12 @@ class KMPNativeAffinity : public KMPAffinity { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); #if KMP_OS_LINUX - int retval = + long retval = syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; @@ -317,13 +352,14 @@ class KMPNativeAffinity : public KMPAffinity { } int set_system_affinity(bool abort_on_error) const override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), - "Illegal get affinity operation when not capable"); + "Illegal set affinity operation when not capable"); #if KMP_OS_LINUX - int retval = + long retval = syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); #elif KMP_OS_FREEBSD - int retval = + int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask)); + int retval = (r == 0 ? 0 : -1); #endif if (retval >= 0) { return 0; @@ -424,6 +460,19 @@ class KMPNativeAffinity : public KMPAffinity { ++retval; return retval; } + int set_process_affinity(bool abort_on_error) const override { + if (__kmp_num_proc_groups <= 1) { + if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), + __kmp_msg_null); + } + return error; + } + } + return 0; + } int set_system_affinity(bool abort_on_error) const override { if (__kmp_num_proc_groups > 1) { // Check for a valid mask.
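In the KMPNativeAffinity::Mask changes above, the affinity set is stored as an array of unsigned long words: OS proc i maps to word i / BITS_PER_MASK_T and bit i % BITS_PER_MASK_T, and the widened ONE constant keeps the shift from being truncated. A self-contained sketch of that indexing, using a fixed-size array instead of __kmp_affin_mask_size and hypothetical proc ids:

    #include <climits>
    #include <cstdio>

    typedef unsigned long mask_t;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;  // widened constant, as in the patched Mask class

    int main() {
      mask_t mask[2] = {0, 0};       // room for 2 * BITS_PER_MASK_T procs
      int procs[] = {0, 3, 33};      // hypothetical OS proc ids to set
      for (int p : procs)
        mask[p / BITS_PER_MASK_T] |= (ONE << (p % BITS_PER_MASK_T));
      // Walk the mask the same way is_set() does and report the set bits.
      for (int i = 0; i < 2 * (int)BITS_PER_MASK_T; ++i)
        if (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)))
          std::printf("proc %d set\n", i);
      return 0;
    }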
@@ -814,15 +863,15 @@ class hierarchy_info { skipPerLevel = &(numPerLevel[maxLevels]); // Copy old elements from old arrays - for (kmp_uint32 i = 0; i < old_maxLevels; - ++i) { // init numPerLevel[*] to 1 item per level + for (kmp_uint32 i = 0; i < old_maxLevels; ++i) { + // init numPerLevel[*] to 1 item per level numPerLevel[i] = old_numPerLevel[i]; skipPerLevel[i] = old_skipPerLevel[i]; } // Init new elements in arrays to 1 - for (kmp_uint32 i = old_maxLevels; i < maxLevels; - ++i) { // init numPerLevel[*] to 1 item per level + for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) { + // init numPerLevel[*] to 1 item per level numPerLevel[i] = 1; skipPerLevel[i] = 1; } diff --git a/runtime/src/kmp_alloc.cpp b/runtime/src/kmp_alloc.cpp index 861940120..31981d5c1 100644 --- a/runtime/src/kmp_alloc.cpp +++ b/runtime/src/kmp_alloc.cpp @@ -57,7 +57,7 @@ static void bectl(kmp_info_t *th, bget_compact_t compact, multiple of this size. This MUST be a power of two. */ /* On IA-32 architecture with Linux* OS, malloc() does not - ensure 16 byte alignmnent */ + ensure 16 byte alignment */ #if KMP_ARCH_X86 || !KMP_HAVE_QUAD @@ -186,7 +186,7 @@ typedef struct thr_data { -1: not all pool blocks are the same size >0: (common) block size for all bpool calls made so far */ - bfhead_t *last_pool; /* Last pool owned by this thread (delay dealocation) */ + bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */ } thr_data_t; /* Minimum allocation quantum: */ @@ -195,7 +195,7 @@ typedef struct thr_data { #define MaxSize \ (bufsize)( \ ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1))) -// Maximun for the requested size. +// Maximum for the requested size. /* End sentinel: value placed in bsize field of dummy block delimiting end of pool block. The most negative number which will fit in a @@ -577,7 +577,7 @@ static void *bget(kmp_info_t *th, bufsize requested_size) { if (thr->acqfcn != 0) { if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) { /* Request is too large to fit in a single expansion block. - Try to satisy it by a direct buffer acquisition. */ + Try to satisfy it by a direct buffer acquisition. 
*/ bdhead_t *bdh; size += sizeof(bdhead_t) - sizeof(bhead_t); @@ -1239,6 +1239,9 @@ static void **mk_hbw_preferred; static void **mk_hugetlb; static void **mk_hbw_hugetlb; static void **mk_hbw_preferred_hugetlb; +static void **mk_dax_kmem; +static void **mk_dax_kmem_all; +static void **mk_dax_kmem_preferred; #if KMP_OS_UNIX && KMP_DYNAMIC_LIB static inline void chk_kind(void ***pkind) { @@ -1279,25 +1282,21 @@ void __kmp_init_memkind() { mk_hbw_preferred_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB"); chk_kind(&mk_hbw_preferred_hugetlb); + mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM"); + chk_kind(&mk_dax_kmem); + mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL"); + chk_kind(&mk_dax_kmem_all); + mk_dax_kmem_preferred = + (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED"); + chk_kind(&mk_dax_kmem_preferred); KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n")); return; // success } dlclose(h_memkind); // failure - h_memkind = NULL; } - kmp_mk_check = NULL; - kmp_mk_alloc = NULL; - kmp_mk_free = NULL; - mk_default = NULL; - mk_interleave = NULL; - mk_hbw = NULL; - mk_hbw_interleave = NULL; - mk_hbw_preferred = NULL; - mk_hugetlb = NULL; - mk_hbw_hugetlb = NULL; - mk_hbw_preferred_hugetlb = NULL; -#else +#else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB) kmp_mk_lib_name = ""; +#endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB) h_memkind = NULL; kmp_mk_check = NULL; kmp_mk_alloc = NULL; @@ -1310,7 +1309,9 @@ void __kmp_init_memkind() { mk_hugetlb = NULL; mk_hbw_hugetlb = NULL; mk_hbw_preferred_hugetlb = NULL; -#endif + mk_dax_kmem = NULL; + mk_dax_kmem_all = NULL; + mk_dax_kmem_preferred = NULL; } void __kmp_fini_memkind() { @@ -1332,6 +1333,9 @@ void __kmp_fini_memkind() { mk_hugetlb = NULL; mk_hbw_hugetlb = NULL; mk_hbw_preferred_hugetlb = NULL; + mk_dax_kmem = NULL; + mk_dax_kmem_all = NULL; + mk_dax_kmem_preferred = NULL; #endif } @@ -1348,27 +1352,27 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, al->memspace = ms; // not used currently for (i = 0; i < ntraits; ++i) { switch (traits[i].key) { - case OMP_ATK_THREADMODEL: - case OMP_ATK_ACCESS: - case OMP_ATK_PINNED: + case omp_atk_sync_hint: + case omp_atk_access: + case omp_atk_pinned: break; - case OMP_ATK_ALIGNMENT: - al->alignment = traits[i].value; + case omp_atk_alignment: + __kmp_type_convert(traits[i].value, &(al->alignment)); KMP_ASSERT(IS_POWER_OF_TWO(al->alignment)); break; - case OMP_ATK_POOL_SIZE: + case omp_atk_pool_size: al->pool_size = traits[i].value; break; - case OMP_ATK_FALLBACK: + case omp_atk_fallback: al->fb = (omp_alloctrait_value_t)traits[i].value; KMP_DEBUG_ASSERT( - al->fb == OMP_ATV_DEFAULT_MEM_FB || al->fb == OMP_ATV_NULL_FB || - al->fb == OMP_ATV_ABORT_FB || al->fb == OMP_ATV_ALLOCATOR_FB); + al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb || + al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb); break; - case OMP_ATK_FB_DATA: + case omp_atk_fb_data: al->fb_data = RCAST(kmp_allocator_t *, traits[i].value); break; - case OMP_ATK_PARTITION: + case omp_atk_partition: al->memkind = RCAST(void **, traits[i].value); break; default: @@ -1377,17 +1381,17 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, } if (al->fb == 0) { // set default allocator - al->fb = OMP_ATV_DEFAULT_MEM_FB; + al->fb = omp_atv_default_mem_fb; al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { 
KMP_ASSERT(al->fb_data != NULL); - } else if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + } else if (al->fb == omp_atv_default_mem_fb) { al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; } if (__kmp_memkind_available) { // Let's use memkind library if available if (ms == omp_high_bw_mem_space) { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_hbw_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) { al->memkind = mk_hbw_interleave; } else if (mk_hbw_preferred) { // AC: do not try to use MEMKIND_HBW for now, because memkind library @@ -1401,8 +1405,19 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, __kmp_free(al); return omp_null_allocator; } + } else if (ms == omp_large_cap_mem_space) { + if (mk_dax_kmem_all) { + // All pmem nodes are visited + al->memkind = mk_dax_kmem_all; + } else if (mk_dax_kmem) { + // Only closest pmem node is visited + al->memkind = mk_dax_kmem; + } else { + __kmp_free(al); + return omp_null_allocator; + } } else { - if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_interleave) { + if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) { al->memkind = mk_interleave; } else { al->memkind = mk_default; @@ -1436,6 +1451,7 @@ omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) { typedef struct kmp_mem_desc { // Memory block descriptor void *ptr_alloc; // Pointer returned by allocator size_t size_a; // Size of allocated memory block (initial+descriptor+align) + size_t size_orig; // Original size requested void *ptr_align; // Pointer to aligned memory, returned kmp_allocator_t *allocator; // allocator } kmp_mem_desc_t; @@ -1445,6 +1461,10 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { void *ptr = NULL; kmp_allocator_t *al; KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (size == 0) + return NULL; + if (allocator == omp_null_allocator) allocator = __kmp_threads[gtid]->th.th_def_allocator; @@ -1460,6 +1480,7 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (allocator > kmp_max_mem_alloc && al->alignment > 0) { align = al->alignment; // alignment requested by user } + desc.size_orig = size; desc.size_a = size + sz_desc + align; if (__kmp_memkind_available) { @@ -1467,6 +1488,8 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // pre-defined allocator if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) { ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a); + } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a); } else { ptr = kmp_mk_alloc(*mk_default, desc.size_a); } @@ -1477,12 +1500,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1491,12 +1514,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t 
allocator) { // pool has enough space ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1507,12 +1530,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // custom allocator, pool size not requested ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1523,6 +1546,8 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { // pre-defined allocator if (allocator == omp_high_bw_mem_alloc) { // ptr = NULL; + } else if (allocator == omp_large_cap_mem_alloc) { + // warnings? } else { ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); } @@ -1533,12 +1558,12 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { if (used + desc.size_a > al->pool_size) { // not enough space, need to go fallback path KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == OMP_ATV_DEFAULT_MEM_FB) { + if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - } else if (al->fb == OMP_ATV_ABORT_FB) { + } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == OMP_ATV_ALLOCATOR_FB) { + } else if (al->fb == omp_atv_allocator_fb) { KMP_ASSERT(al != al->fb_data); al = al->fb_data; return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al); @@ -1546,14 +1571,14 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { } else { // pool has enough space ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } } else { // custom allocator, pool size not requested ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) { + if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } @@ -1575,6 +1600,80 @@ void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { return desc.ptr_align; } +void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + void *ptr = NULL; + kmp_allocator_t *al; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (allocator == 
omp_null_allocator) + allocator = __kmp_threads[gtid]->th.th_def_allocator; + + KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, + (int)size, allocator)); + + al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + + if (nmemb == 0 || size == 0) + return ptr; + + if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) { + if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); + } + return ptr; + } + + ptr = __kmpc_alloc(gtid, nmemb * size, allocator); + + if (ptr) { + memset(ptr, 0x00, nmemb * size); + } + KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + void *nptr = NULL; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (size == 0) { + if (ptr != NULL) + __kmpc_free(gtid, ptr, free_allocator); + return nptr; + } + + KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, + allocator, free_allocator)); + + nptr = __kmpc_alloc(gtid, size, allocator); + + if (nptr != NULL && ptr != NULL) { + kmp_mem_desc_t desc; + kmp_uintptr_t addr_align; // address to return to caller + kmp_uintptr_t addr_descr; // address of memory block descriptor + + addr_align = (kmp_uintptr_t)ptr; + addr_descr = addr_align - sizeof(kmp_mem_desc_t); + desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor + + KMP_DEBUG_ASSERT(desc.ptr_align == ptr); + KMP_DEBUG_ASSERT(desc.size_orig > 0); + KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a); + KMP_MEMCPY((char *)nptr, (char *)ptr, + (size_t)((size < desc.size_orig) ? size : desc.size_orig)); + } + + if (nptr != NULL) { + __kmpc_free(gtid, ptr, free_allocator); + } + + KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); + return nptr; +} + void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); if (ptr == NULL) @@ -1604,6 +1703,8 @@ void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) { // pre-defined allocator if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) { kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc); + } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc); } else { kmp_mk_free(*mk_default, desc.ptr_alloc); } @@ -1812,8 +1913,7 @@ void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) { void *ptr; - int num_lines; - int idx; + size_t num_lines, idx; int index; void *alloc_ptr; size_t alloc_size; @@ -1961,7 +2061,7 @@ void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { this_thr->th.th_free_lists[index].th_free_list_other = ptr; } else { // either queue blocks owner is changing or size limit exceeded - // return old queue to allocating thread (q_th) synchroneously, + // return old queue to allocating thread (q_th) synchronously, // and start new list for alloc_thr's tasks void *old_ptr; void *tail = head; diff --git a/runtime/src/kmp_atomic.cpp b/runtime/src/kmp_atomic.cpp index f1ee3d2cd..a9d5257ab 100644 --- a/runtime/src/kmp_atomic.cpp +++ b/runtime/src/kmp_atomic.cpp @@ -141,7 +141,7 @@ Full list of functions ====================== This leads to the generation of 376 atomic functions, as follows. 
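As an illustration of how these entry points are reached (a sketch, not part of this patch): an atomic update of a 4-byte signed integer is lowered by the compiler to the fixed4 entry point, following the usual ident_t/gtid/lhs/rhs argument pattern used throughout this file.

@code
// int x;             (shared)
// #pragma omp atomic
// x += 5;
// lowers, conceptually, to:
__kmpc_atomic_fixed4_add(&loc, gtid, &x, 5);
// loc is an ident_t describing the source location, gtid the caller's
// global thread id.
@endcode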
-Functons for integers +Functions for integers --------------------- There are versions here for integers of size 1,2,4 and 8 bytes both signed and unsigned (where that matters). @@ -483,8 +483,8 @@ Functions for Complex types --------------------------- Functions for complex types whose component floating point variables are of size 4,8,10 or 16 bytes. The names here are based on the size of the component float, -*not* the size of the complex type. So `__kmpc_atomc_cmplx8_add` is an operation -on a `complex` or `complex(kind=8)`, *not* `complex`. +*not* the size of the complex type. So `__kmpc_atomic_cmplx8_add` is an +operation on a `complex` or `complex(kind=8)`, *not* `complex`. @code __kmpc_atomic_cmplx4_add @@ -606,17 +606,17 @@ kmp_atomic_lock_t __kmp_atomic_lock_32c; #if (KMP_ARCH_X86) && KMP_HAVE_QUAD -static inline void operator+=(Quad_a4_t &lhs, Quad_a4_t &rhs) { - lhs.q += rhs.q; +static inline Quad_a4_t operator+(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q + rhs.q; } -static inline void operator-=(Quad_a4_t &lhs, Quad_a4_t &rhs) { - lhs.q -= rhs.q; +static inline Quad_a4_t operator-(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q - rhs.q; } -static inline void operator*=(Quad_a4_t &lhs, Quad_a4_t &rhs) { - lhs.q *= rhs.q; +static inline Quad_a4_t operator*(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q * rhs.q; } -static inline void operator/=(Quad_a4_t &lhs, Quad_a4_t &rhs) { - lhs.q /= rhs.q; +static inline Quad_a4_t operator/(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q / rhs.q; } static inline bool operator<(Quad_a4_t &lhs, Quad_a4_t &rhs) { return lhs.q < rhs.q; @@ -625,17 +625,17 @@ static inline bool operator>(Quad_a4_t &lhs, Quad_a4_t &rhs) { return lhs.q > rhs.q; } -static inline void operator+=(Quad_a16_t &lhs, Quad_a16_t &rhs) { - lhs.q += rhs.q; +static inline Quad_a16_t operator+(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q + rhs.q; } -static inline void operator-=(Quad_a16_t &lhs, Quad_a16_t &rhs) { - lhs.q -= rhs.q; +static inline Quad_a16_t operator-(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q - rhs.q; } -static inline void operator*=(Quad_a16_t &lhs, Quad_a16_t &rhs) { - lhs.q *= rhs.q; +static inline Quad_a16_t operator*(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q * rhs.q; } -static inline void operator/=(Quad_a16_t &lhs, Quad_a16_t &rhs) { - lhs.q /= rhs.q; +static inline Quad_a16_t operator/(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q / rhs.q; } static inline bool operator<(Quad_a16_t &lhs, Quad_a16_t &rhs) { return lhs.q < rhs.q; @@ -644,34 +644,38 @@ static inline bool operator>(Quad_a16_t &lhs, Quad_a16_t &rhs) { return lhs.q > rhs.q; } -static inline void operator+=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) { - lhs.q += rhs.q; +static inline kmp_cmplx128_a4_t operator+(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q + rhs.q; } -static inline void operator-=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) { - lhs.q -= rhs.q; +static inline kmp_cmplx128_a4_t operator-(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q - rhs.q; } -static inline void operator*=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) { - lhs.q *= rhs.q; +static inline kmp_cmplx128_a4_t operator*(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q * rhs.q; } -static inline void operator/=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) { - lhs.q /= rhs.q; +static inline kmp_cmplx128_a4_t operator/(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q / rhs.q; } -static inline 
void operator+=(kmp_cmplx128_a16_t &lhs, - kmp_cmplx128_a16_t &rhs) { - lhs.q += rhs.q; +static inline kmp_cmplx128_a16_t operator+(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q + rhs.q; } -static inline void operator-=(kmp_cmplx128_a16_t &lhs, - kmp_cmplx128_a16_t &rhs) { - lhs.q -= rhs.q; +static inline kmp_cmplx128_a16_t operator-(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q - rhs.q; } -static inline void operator*=(kmp_cmplx128_a16_t &lhs, - kmp_cmplx128_a16_t &rhs) { - lhs.q *= rhs.q; +static inline kmp_cmplx128_a16_t operator*(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q * rhs.q; } -static inline void operator/=(kmp_cmplx128_a16_t &lhs, - kmp_cmplx128_a16_t &rhs) { - lhs.q /= rhs.q; +static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q / rhs.q; } #endif // (KMP_ARCH_X86) && KMP_HAVE_QUAD @@ -726,6 +730,11 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); +#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + (*lhs) = (TYPE)((*lhs)OP((TYPE)rhs)); \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + // ------------------------------------------------------------------------ // For GNU compatibility, we may need to use a critical section, // even though it is not required by the ISA. @@ -755,8 +764,16 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, OP_CRITICAL(OP, 0); \ return; \ } + +#define OP_UPDATE_GOMP_CRITICAL(TYPE, OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, 0); \ + return; \ + } #else #define OP_GOMP_CRITICAL(OP, FLAG) +#define OP_UPDATE_GOMP_CRITICAL(TYPE, OP, FLAG) #endif /* KMP_GOMP_COMPAT */ #if KMP_MIC @@ -774,14 +791,14 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, { \ TYPE old_value, new_value; \ old_value = *(TYPE volatile *)lhs; \ - new_value = old_value OP rhs; \ + new_value = (TYPE)(old_value OP((TYPE)rhs)); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ KMP_DO_PAUSE; \ \ old_value = *(TYPE volatile *)lhs; \ - new_value = old_value OP rhs; \ + new_value = (TYPE)(old_value OP((TYPE)rhs)); \ } \ } @@ -802,14 +819,14 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, old_value.vvv = (kmp_int##BITS *)&old_value.cmp; \ new_value.vvv = (kmp_int##BITS *)&new_value.cmp; \ *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ - new_value.cmp = old_value.cmp OP rhs; \ + new_value.cmp = (TYPE)(old_value.cmp OP rhs); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv, \ *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) { \ KMP_DO_PAUSE; \ \ *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ - new_value.cmp = old_value.cmp OP rhs; \ + new_value.cmp = (TYPE)(old_value.cmp OP rhs); \ } \ } // end of the first part of the workaround for C78287 @@ -822,7 +839,7 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, #define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ } @@ -830,7 +847,7 @@ static 
inline void operator/=(kmp_cmplx128_a16_t &lhs, #define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG(TYPE, BITS, OP) \ } #if USE_CMPXCHG_FIX @@ -839,7 +856,7 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, #define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, \ MASK, GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP) \ } // end of the second part of the workaround for C78287 @@ -851,25 +868,27 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, #define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */ \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ } \ } // ------------------------------------------------------------------------- #define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */ \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ } \ } #if USE_CMPXCHG_FIX @@ -878,12 +897,13 @@ static inline void operator/=(kmp_cmplx128_a16_t &lhs, #define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, \ MASK, GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */ \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ } \ } // end of the second part of the workaround for C78287 @@ -1200,8 +1220,8 @@ MIN_MAX_CRITICAL(float16, min_a16, Quad_a16_t, >, 16r, // OP ignored for critical sections, ^=~ used instead #define ATOMIC_CRIT_EQV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(^= ~, GOMP_FLAG) /* send assignment */ \ - OP_CRITICAL(^= ~, LCK_ID) /* send assignment and complement */ \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL(^= (TYPE) ~, LCK_ID) /* send assignment and complement */ \ } // ------------------------------------------------------------------------ @@ -1211,7 +1231,7 @@ MIN_MAX_CRITICAL(float16, min_a16, Quad_a16_t, >, 16r, #define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(^= ~, GOMP_FLAG) /* send assignment */ \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ OP_CMPXCHG(TYPE, BITS, OP) \ } // 
------------------------------------------------------------------------ @@ -1221,12 +1241,12 @@ MIN_MAX_CRITICAL(float16, min_a16, Quad_a16_t, >, 16r, #define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(^= ~, GOMP_FLAG) \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(^= ~, LCK_ID) /* unaligned address - use critical */ \ + OP_CRITICAL(^= (TYPE) ~, LCK_ID) /* unaligned address - use critical */ \ } \ } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -1256,8 +1276,8 @@ ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7, // LCK_ID - lock identifier, used to possibly distinguish lock variable #define ATOMIC_CRITICAL(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) /* send assignment */ \ - OP_CRITICAL(OP## =, LCK_ID) /* send assignment */ \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) /* send assignment */ \ } /* ------------------------------------------------------------------------- */ @@ -1354,22 +1374,23 @@ ATOMIC_CRITICAL(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, // LCK_ID - lock identifier // Note: don't check gtid as it should always be valid // 1, 2-byte - expect valid parameter, other - check before this macro -#define OP_CRITICAL_REV(OP, LCK_ID) \ +#define OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ \ - (*lhs) = (rhs)OP(*lhs); \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); #ifdef KMP_GOMP_COMPAT -#define OP_GOMP_CRITICAL_REV(OP, FLAG) \ +#define OP_GOMP_CRITICAL_REV(TYPE, OP, FLAG) \ if ((FLAG) && (__kmp_atomic_mode == 2)) { \ KMP_CHECK_GTID; \ - OP_CRITICAL_REV(OP, 0); \ + OP_CRITICAL_REV(TYPE, OP, 0); \ return; \ } + #else -#define OP_GOMP_CRITICAL_REV(OP, FLAG) +#define OP_GOMP_CRITICAL_REV(TYPE, OP, FLAG) #endif /* KMP_GOMP_COMPAT */ // Beginning of a definition (provides name, parameters, gebug trace) @@ -1396,7 +1417,7 @@ ATOMIC_CRITICAL(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, TYPE old_value, new_value; \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = rhs OP old_value; \ + new_value = (TYPE)(rhs OP old_value); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ @@ -1404,14 +1425,14 @@ ATOMIC_CRITICAL(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = rhs OP old_value; \ + new_value = (TYPE)(rhs OP old_value); \ } \ } // ------------------------------------------------------------------------- #define ATOMIC_CMPXCHG_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_REV(TYPE, BITS, OP) \ } @@ -1500,8 +1521,8 @@ ATOMIC_CMPXCHG_REV(float8, sub, kmp_real64, 64, -, 8r, // LCK_ID - lock identifier, used to possibly distinguish lock variable #define ATOMIC_CRITICAL_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void) \ - OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG) \ - OP_CRITICAL_REV(OP, LCK_ID) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + 
OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ } /* ------------------------------------------------------------------------- */ @@ -1576,8 +1597,8 @@ ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, #define ATOMIC_CRITICAL_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, LCK_ID, \ GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) /* send assignment */ \ - OP_CRITICAL(OP## =, LCK_ID) /* send assignment */ \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) /* send assignment */ \ } // ------------------------------------------------------------------------- @@ -1587,7 +1608,7 @@ ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, #define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG(TYPE, BITS, OP) \ } // ------------------------------------------------------------------------- @@ -1597,12 +1618,13 @@ ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, #define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */ \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ } \ } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -1613,14 +1635,14 @@ ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, #define ATOMIC_CMPXCHG_REV_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, \ RTYPE, LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_REV(TYPE, BITS, OP) \ } #define ATOMIC_CRITICAL_REV_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, \ LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG) \ - OP_CRITICAL_REV(OP, LCK_ID) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -1812,7 +1834,7 @@ ATOMIC_CRITICAL_REV_FP(float10, long double, div_rev, /, fp, _Quad, 10r, #define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP) \ } // end of the second part of the workaround for C78287 @@ -1820,7 +1842,7 @@ ATOMIC_CRITICAL_REV_FP(float10, long double, div_rev, /, fp, _Quad, 10r, #define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG(TYPE, BITS, OP) \ } #endif // USE_CMPXCHG_FIX @@ -1830,12 +1852,13 @@ ATOMIC_CRITICAL_REV_FP(float10, long double, div_rev, /, fp, _Quad, 10r, #define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, 
RTYPE_ID, RTYPE, \ LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ - OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ } else { \ KMP_CHECK_GTID; \ - OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */ \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ } \ } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -2172,15 +2195,29 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ return new_value; +#define OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ + new_value = (*lhs); \ + } else { \ + new_value = (*lhs); \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return new_value; + // ------------------------------------------------------------------------ #ifdef KMP_GOMP_COMPAT -#define OP_GOMP_CRITICAL_CPT(OP, FLAG) \ +#define OP_GOMP_CRITICAL_CPT(TYPE, OP, FLAG) \ if ((FLAG) && (__kmp_atomic_mode == 2)) { \ KMP_CHECK_GTID; \ - OP_CRITICAL_CPT(OP## =, 0); \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, 0); \ } #else -#define OP_GOMP_CRITICAL_CPT(OP, FLAG) +#define OP_GOMP_CRITICAL_CPT(TYPE, OP, FLAG) #endif /* KMP_GOMP_COMPAT */ // ------------------------------------------------------------------------ @@ -2196,7 +2233,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, TYPE old_value, new_value; \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = old_value OP rhs; \ + new_value = (TYPE)(old_value OP rhs); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ @@ -2204,7 +2241,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = old_value OP rhs; \ + new_value = (TYPE)(old_value OP rhs); \ } \ if (flag) { \ return new_value; \ @@ -2216,7 +2253,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, #define ATOMIC_CMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT(TYPE, BITS, OP) \ } @@ -2224,7 +2261,7 @@ ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, #define ATOMIC_FIXED_ADD_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE old_value, new_value; \ - OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ old_value = KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ if (flag) { \ @@ -2375,7 +2412,7 @@ ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *, RTYPE, LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT(TYPE, BITS, OP) \ } @@ -2384,8 +2421,8 @@ ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) /* send assignment */ \ - 
OP_CRITICAL_CPT(OP## =, LCK_ID) /* send assignment */ \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) /* send assignment */ \ } ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, add_cpt, 8, +, fp, _Quad, 1i, 0, @@ -2499,8 +2536,11 @@ ATOMIC_CRITICAL_CPT_MIX(float10, long double, div_cpt, /, fp, _Quad, 10r, \ if (flag) { \ new_value OP rhs; \ - } else \ + (*lhs) = new_value; \ + } else { \ new_value = (*lhs); \ + (*lhs) OP rhs; \ + } \ \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); @@ -2673,7 +2713,7 @@ MIN_MAX_CRITICAL_CPT(float16, min_a16_cpt, Quad_a16_t, >, 16r, #define ATOMIC_CMPX_EQV_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_EQV_CPT(^= ~, GOMP_FLAG) /* send assignment */ \ + OP_GOMP_CRITICAL_EQV_CPT(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ OP_CMPXCHG_CPT(TYPE, BITS, OP) \ } @@ -2705,8 +2745,8 @@ ATOMIC_CMPX_EQV_CPT(fixed8, eqv_cpt, kmp_int64, 64, ^~, #define ATOMIC_CRITICAL_CPT(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) /* send assignment */ \ - OP_CRITICAL_CPT(OP## =, LCK_ID) /* send assignment */ \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) /* send assignment */ \ } // ------------------------------------------------------------------------ @@ -2843,29 +2883,29 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c, // LCK_ID - lock identifier // Note: don't check gtid as it should always be valid // 1, 2-byte - expect valid parameter, other - check before this macro -#define OP_CRITICAL_CPT_REV(OP, LCK_ID) \ +#define OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) \ __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ \ if (flag) { \ /*temp_val = (*lhs);*/ \ - (*lhs) = (rhs)OP(*lhs); \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ new_value = (*lhs); \ } else { \ new_value = (*lhs); \ - (*lhs) = (rhs)OP(*lhs); \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ } \ __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ return new_value; // ------------------------------------------------------------------------ #ifdef KMP_GOMP_COMPAT -#define OP_GOMP_CRITICAL_CPT_REV(OP, FLAG) \ +#define OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, FLAG) \ if ((FLAG) && (__kmp_atomic_mode == 2)) { \ KMP_CHECK_GTID; \ - OP_CRITICAL_CPT_REV(OP, 0); \ + OP_CRITICAL_CPT_REV(TYPE, OP, 0); \ } #else -#define OP_GOMP_CRITICAL_CPT_REV(OP, FLAG) +#define OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, FLAG) #endif /* KMP_GOMP_COMPAT */ // ------------------------------------------------------------------------ @@ -2881,7 +2921,7 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c, TYPE old_value, new_value; \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = rhs OP old_value; \ + new_value = (TYPE)(rhs OP old_value); \ while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ @@ -2889,7 +2929,7 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c, \ temp_val = *lhs; \ old_value = temp_val; \ - new_value = rhs OP old_value; \ + new_value = (TYPE)(rhs OP old_value); \ } \ if (flag) { \ return new_value; \ @@ -2901,7 +2941,7 @@ ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c, #define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) 
\ ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ } @@ -2973,8 +3013,8 @@ ATOMIC_CMPXCHG_CPT_REV(float8, sub_cpt_rev, kmp_real64, 64, -, ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ TYPE new_value; \ /*printf("__kmp_atomic_mode = %d\n", __kmp_atomic_mode);*/ \ - OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG) \ - OP_CRITICAL_CPT_REV(OP, LCK_ID) \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ + OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) \ } /* ------------------------------------------------------------------------- */ @@ -3077,7 +3117,7 @@ ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c, RTYPE, LCK_ID, MASK, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG) \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ } @@ -3086,8 +3126,8 @@ ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c, LCK_ID, GOMP_FLAG) \ ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ TYPE new_value; \ - OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG) /* send assignment */ \ - OP_CRITICAL_CPT_REV(OP, LCK_ID) /* send assignment */ \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) /* send assignment */ \ } ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1, char, sub_cpt_rev, 8, -, fp, _Quad, 1i, 0, diff --git a/runtime/src/kmp_atomic.h b/runtime/src/kmp_atomic.h index bb01c3164..8f70928ca 100644 --- a/runtime/src/kmp_atomic.h +++ b/runtime/src/kmp_atomic.h @@ -39,7 +39,7 @@ #define KMP_DO_ALIGN(alignment) /* Nothing */ #endif -#if (_MSC_VER < 1600) && defined(_DEBUG) +#if defined(_MSC_VER) && (_MSC_VER < 1600) && defined(_DEBUG) // Workaround for the problem of _DebugHeapTag unresolved external. // This problem prevented to use our static debug library for C tests // compiled with /MDd option (the library itself built with /MTd), diff --git a/runtime/src/kmp_barrier.cpp b/runtime/src/kmp_barrier.cpp index e17986b16..d69c22384 100644 --- a/runtime/src/kmp_barrier.cpp +++ b/runtime/src/kmp_barrier.cpp @@ -15,9 +15,7 @@ #include "kmp_itt.h" #include "kmp_os.h" #include "kmp_stats.h" -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #if KMP_MIC #include @@ -39,6 +37,8 @@ #define ngo_sync() ((void)0) #endif /* KMP_MIC && USE_NGO_STORES */ +#if !KMP_USE_ABT + void __kmp_print_structure(void); // Forward declaration // ---------------------------- Barrier Algorithms ---------------------------- @@ -80,7 +80,7 @@ static bool __kmp_linear_barrier_gather_template( is valid any more - it could be deallocated by the master thread at any time. 
*/ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); flag.release(); } else { kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; @@ -103,14 +103,14 @@ static bool __kmp_linear_barrier_gather_template( &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); // Wait for worker thread to arrive - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, - new_state); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) + kmp_flag_64 flag( + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; } else { + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, + new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(other_threads[i]); @@ -128,8 +128,11 @@ static bool __kmp_linear_barrier_gather_template( gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, other_threads[i]->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -202,7 +205,7 @@ static bool __kmp_linear_barrier_release_template( other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); ANNOTATE_BARRIER_BEGIN(other_threads[i]); - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); flag.release(); } @@ -210,14 +213,12 @@ static bool __kmp_linear_barrier_release_template( } else { // Wait for the MASTER thread to release us KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); if (cancellable) { - bool cancelled = flag.wait_cancellable_nosleep( - this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); - if (cancelled) { + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) return true; - } } else { + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } ANNOTATE_BARRIER_END(this_thr); @@ -338,7 +339,7 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -355,8 +356,11 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -380,7 +384,7 @@ __kmp_tree_barrier_gather(enum barrier_type bt, 
kmp_info_t *this_thr, int gtid, is valid any more - it could be deallocated by the master thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); flag.release(); } else { // Need to update the team arrived pointer if we are the master thread @@ -416,7 +420,7 @@ static void __kmp_tree_barrier_release( KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -494,7 +498,7 @@ static void __kmp_tree_barrier_release( child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); child++; child_tid++; @@ -536,7 +540,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, #endif /* Perform a hypercube-embedded tree gather to wait until all of the threads have arrived, and reduce any required data as we go. */ - kmp_flag_64 p_flag(&thr_bar->b_arrived); + kmp_flag_64<> p_flag(&thr_bar->b_arrived); for (level = 0, offset = 1; offset < num_threads; level += branch_bits, offset <<= branch_bits) { kmp_uint32 child; @@ -545,6 +549,7 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, if (((tid >> level) & (branch_factor - 1)) != 0) { kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); + KMP_MB(); // Synchronize parent and child threads. KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " "arrived(%p): %llu => %llu\n", @@ -583,9 +588,10 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); // Wait for child to arrive - kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); + KMP_MB(); // Synchronize parent and child threads. #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and a child time to // the thread. 
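A recurring change in these barrier hunks is the switch from kmp_flag_64 to kmp_flag_64<>, with the cancellable wait paths now constructing the flag inside the if (cancellable) branch instead of calling a separate wait_cancellable_nosleep. This reads as kmp_flag_64 having become a class template whose defaulted parameters select, at compile time, whether the wait can be cancelled. A reduced sketch of that pattern, with illustrative names (flag64, Cancellable) that are not the runtime's own:

template <bool Cancellable = false> class flag64 {
public:
  // Returns true only when a cancellable wait observed a cancellation.
  bool wait() {
    if (Cancellable)
      return poll_for_cancellation(); // no sleeping, may report cancellation
    block_until_released();           // ordinary wait, never "cancelled"
    return false;
  }
private:
  bool poll_for_cancellation() { return false; } // placeholder body
  void block_until_released() {}                 // placeholder body
};

void barrier_sketch() {
  flag64<> plain;     // the common case: note the now-mandatory "<>"
  flag64<true> canc;  // cancellable variant selected at compile time
  (void)plain.wait();
  (void)canc.wait();
}

Selecting the behaviour through a template parameter lets the non-cancellable fast path carry no cancellation checks at run time, which would explain why the plain uses only need the empty argument list added.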
@@ -600,8 +606,11 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -661,7 +670,7 @@ static void __kmp_hyper_barrier_release( KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY @@ -763,7 +772,7 @@ static void __kmp_hyper_barrier_release( child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child from barrier ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } @@ -821,8 +830,8 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, thr_bar->parent_tid = 0; thr_bar->my_level = d; break; - } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != - 0) { // TODO: can we make this op faster? + } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { + // TODO: can we make the above op faster? // thread is not a subtree root at next level, so this is max thr_bar->parent_tid = tid - rem; thr_bar->my_level = d; @@ -831,7 +840,9 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, ++d; } } - thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1); + __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) / + (thr_bar->skip_per_level[thr_bar->my_level])), + &(thr_bar->offset)); thr_bar->old_tid = tid; thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; thr_bar->team = team; @@ -850,7 +861,7 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, if (thr_bar->my_level == 0) thr_bar->leaf_kids = 0; if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) - thr_bar->leaf_kids = nproc - tid - 1; + __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids)); thr_bar->leaf_state = 0; for (int i = 0; i < thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7 - i] = 1; @@ -908,10 +919,12 @@ static void __kmp_hierarchical_barrier_gather( KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " "for leaf kids\n", gtid, team->t.t_id, tid)); - kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); if (reduce) { ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; ++child_tid) { KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " @@ -923,6 +936,7 @@ static void __kmp_hierarchical_barrier_gather( (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); } + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -945,7 +959,7 @@ static void __kmp_hierarchical_barrier_gather( gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, 
&child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -978,7 +992,7 @@ static void __kmp_hierarchical_barrier_gather( gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(child_thr); if (reduce) { @@ -1013,12 +1027,14 @@ static void __kmp_hierarchical_barrier_gather( !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived // flag; release it ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + kmp_flag_64<> flag(&thr_bar->b_arrived, + other_threads[thr_bar->parent_tid]); flag.release(); } else { // Leaf does special release on "offset" bits of parent's b_arrived flag thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); + kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, + thr_bar->offset + 1); flag.set_waiter(other_threads[thr_bar->parent_tid]); flag.release(); } @@ -1057,7 +1073,7 @@ static void __kmp_hierarchical_barrier_release( thr_bar->team == NULL) { // Use traditional method of waiting on my own b_go flag thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); ANNOTATE_BARRIER_END(this_thr); TCW_8(thr_bar->b_go, @@ -1067,7 +1083,7 @@ static void __kmp_hierarchical_barrier_release( // Wait on my "offset" bits on parent's b_go flag thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, - thr_bar->offset, bt, + thr_bar->offset + 1, bt, this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); flag.wait(this_thr, TRUE); if (thr_bar->wait_flag == @@ -1076,7 +1092,7 @@ static void __kmp_hierarchical_barrier_release( KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time } else { // Reset my bits on parent's b_go flag (RCAST(volatile char *, - &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0; + &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; } } thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; @@ -1206,7 +1222,7 @@ static void __kmp_hierarchical_barrier_release( child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } else { // Release all children at once with leaf_state bits on my own @@ -1232,7 +1248,7 @@ static void __kmp_hierarchical_barrier_release( child_bar->b_go + KMP_BARRIER_STATE_BUMP)); // Release child using child's b_go flag ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); + kmp_flag_64<> flag(&child_bar->b_go, child_thr); flag.release(); } } @@ -1441,7 +1457,8 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, // Barrier - report frame end (only if active_level == 1) if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && - 
this_thr->th.th_teams_microtask == NULL && + (this_thr->th.th_teams_microtask == NULL || // either not in teams + this_thr->th.th_teams_size.nteams == 1) && // or inside single team team->t.t_active_level == 1) { ident_t *loc = __kmp_threads[gtid]->th.th_ident; kmp_uint64 cur_time = __itt_get_timestamp(); @@ -1827,7 +1844,9 @@ void __kmp_join_barrier(int gtid) { #if USE_ITT_BUILD && USE_ITT_NOTIFY // Join barrier - report frame end if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && - __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL && + __kmp_forkjoin_frames_mode && + (this_thr->th.th_teams_microtask == NULL || // either not in teams + this_thr->th.th_teams_size.nteams == 1) && // or inside single team team->t.t_active_level == 1) { kmp_uint64 cur_time = __itt_get_timestamp(); ident_t *loc = team->t.t_ident; @@ -2115,6 +2134,65 @@ void __kmp_fork_barrier(int gtid, int tid) { team->t.t_id, tid)); } +#else // !KMP_USE_ABT + +#if defined(KMP_GOMP_COMPAT) +// Returns 1 if cancelled, 0 otherwise +int __kmp_barrier_gomp_cancel(int gtid) { + return 0; +} +#endif + +int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, + size_t reduce_size, void *reduce_data, + void (*reduce)(void *, void *)) { + int tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + int status = 0; + int ret; + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, + __kmp_tid_from_gtid(gtid))); + // Complete and free all child tasks. + __kmp_abt_wait_child_tasks(this_thr, true, FALSE); + if (!team->t.t_serialized) { + KMP_MB(); + kmp_taskdata_t *taskdata = this_thr->th.th_current_task; + __kmp_abt_release_info(this_thr); + if (KMP_MASTER_TID(tid)) { + status = 0; + ret = ABT_barrier_wait(team->t.t_team_bar); + KMP_DEBUG_ASSERT(ret == ABT_SUCCESS); + } else { + status = 1; + ret = ABT_barrier_wait(team->t.t_team_bar); + KMP_DEBUG_ASSERT(ret == ABT_SUCCESS); + } + __kmp_abt_acquire_info_for_task(this_thr, taskdata, team); + } else { // Team is serialized. 
+ status = 0; + } + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, + __kmp_tid_from_gtid(gtid), status)); + return status; +} + +void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + if (!team->t.t_serialized) { + if (KMP_MASTER_GTID(gtid)) { + int ret = ABT_barrier_wait(team->t.t_team_bar); + KMP_DEBUG_ASSERT(ret == ABT_SUCCESS); + (void)ret; + } + } +} + +#endif // KMP_USE_ABT + void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); diff --git a/runtime/src/kmp_config.h.cmake b/runtime/src/kmp_config.h.cmake index e3a1a8db7..315ba1fe0 100644 --- a/runtime/src/kmp_config.h.cmake +++ b/runtime/src/kmp_config.h.cmake @@ -42,8 +42,10 @@ #define USE_DEBUGGER LIBOMP_USE_DEBUGGER #cmakedefine01 LIBOMP_OMPT_DEBUG #define OMPT_DEBUG LIBOMP_OMPT_DEBUG -#cmakedefine01 LIBOMP_OMPT_SUPPORT -#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT +#cmakedefine01 LIBBOLT_OMPT_SUPPORT +#define OMPT_SUPPORT LIBBOLT_OMPT_SUPPORT +#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT +#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT #cmakedefine01 LIBOMP_OMPT_OPTIONAL #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS @@ -60,20 +62,42 @@ #define KMP_USE_HWLOC LIBOMP_USE_HWLOC #cmakedefine01 LIBOMP_ENABLE_SHARED #define KMP_DYNAMIC_LIB LIBOMP_ENABLE_SHARED -#define KMP_ARCH_STR "@LIBOMP_LEGAL_ARCH@" +#define KMP_ARCH_STR "@LIBBOLT_LEGAL_ARCH@" #define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@" -#define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@ -#define KMP_VERSION_MINOR @LIBOMP_VERSION_MINOR@ +#define KMP_VERSION_MAJOR @LIBBOLT_VERSION_MAJOR@ +#define KMP_VERSION_MINOR @LIBBOLT_VERSION_MINOR@ #cmakedefine01 LIBOMP_TSAN_SUPPORT #if LIBOMP_TSAN_SUPPORT #define TSAN_SUPPORT #endif #cmakedefine01 MSVC #define KMP_MSVC_COMPAT MSVC +#cmakedefine01 LIBOMP_HAVE_WAITPKG_INTRINSICS +#define KMP_HAVE_WAITPKG_INTRINSICS LIBOMP_HAVE_WAITPKG_INTRINSICS +#cmakedefine01 LIBOMP_HAVE_RTM_INTRINSICS +#define KMP_HAVE_RTM_INTRINSICS LIBOMP_HAVE_RTM_INTRINSICS +#cmakedefine01 LIBOMP_HAVE_IMMINTRIN_H +#define KMP_HAVE_IMMINTRIN_H LIBOMP_HAVE_IMMINTRIN_H +#cmakedefine01 LIBOMP_HAVE_INTRIN_H +#define KMP_HAVE_INTRIN_H LIBOMP_HAVE_INTRIN_H +#cmakedefine01 LIBOMP_HAVE_ATTRIBUTE_WAITPKG +#define KMP_HAVE_ATTRIBUTE_WAITPKG LIBOMP_HAVE_ATTRIBUTE_WAITPKG +#cmakedefine01 LIBOMP_HAVE_ATTRIBUTE_RTM +#define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM +#cmakedefine01 LIBOMP_ARCH_AARCH64_A64FX +#define KMP_ARCH_AARCH64_A64FX LIBOMP_ARCH_AARCH64_A64FX + +#cmakedefine01 LIBOMP_USE_ARGOBOTS +#define KMP_USE_ABT LIBOMP_USE_ARGOBOTS + +#cmakedefine01 LIBOMP_REMOVE_FORKJOIN_LOCK +#define KMP_REMOVE_FORKJOIN_LOCK LIBOMP_REMOVE_FORKJOIN_LOCK // Configured cache line based on architecture #if KMP_ARCH_PPC64 # define CACHE_LINE 128 +#elif KMP_ARCH_AARCH64_A64FX +# define CACHE_LINE 256 #else # define CACHE_LINE 64 #endif @@ -86,9 +110,13 @@ #define KMP_ADJUST_BLOCKTIME 1 #define BUILD_PARALLEL_ORDERED 1 #define KMP_ASM_INTRINS 1 -#define USE_ITT_BUILD LIBOMP_USE_ITT_NOTIFY +#if !KMP_USE_ABT +# define USE_ITT_BUILD LIBOMP_USE_ITT_NOTIFY +#else +# define USE_ITT_BUILD 0 +#endif #define INTEL_ITTNOTIFY_PREFIX __kmp_itt_ -#if ! KMP_MIC +#if ! KMP_MIC && ! KMP_USE_ABT # define USE_LOAD_BALANCE 1 #endif #if ! 
(KMP_OS_WINDOWS || KMP_OS_DARWIN) diff --git a/runtime/src/kmp_csupport.cpp b/runtime/src/kmp_csupport.cpp index d39bf9af4..b6c7809de 100644 --- a/runtime/src/kmp_csupport.cpp +++ b/runtime/src/kmp_csupport.cpp @@ -18,10 +18,7 @@ #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_stats.h" - -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #define MAX_MESSAGE 512 @@ -95,7 +92,7 @@ construct, since the master thread is necessarily thread zero). If multiple non-OpenMP threads all enter an OpenMP construct then this will be a unique thread identifier among all the threads created by -the OpenMP runtime (but the value cannote be defined in terms of +the OpenMP runtime (but the value cannot be defined in terms of OpenMP thread ids returned by omp_get_thread_num()). */ kmp_int32 __kmpc_global_thread_num(ident_t *loc) { @@ -234,13 +231,12 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads) { KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", global_tid, num_threads)); - + __kmp_assert_valid_gtid(global_tid); __kmp_push_num_threads(loc, global_tid, num_threads); } void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) { KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n")); - /* the num_threads are automatically popped */ } @@ -248,7 +244,7 @@ void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind) { KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid, proc_bind)); - + __kmp_assert_valid_gtid(global_tid); __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind); } @@ -301,8 +297,8 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame); } ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif #if INCLUDE_SSC_MARKS @@ -311,13 +307,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { __kmp_fork_call(loc, gtid, fork_context_intel, argc, VOLATILE_CAST(microtask_t) microtask, // "wrapped" task VOLATILE_CAST(launch_t) __kmp_invoke_task_func, -/* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - &ap -#else - ap -#endif - ); + kmp_va_addr_of(ap)); #if INCLUDE_SSC_MARKS SSC_MARK_JOINING(); #endif @@ -334,6 +324,7 @@ void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
{ #if KMP_STATS_ENABLED if (previous_state == stats_state_e::SERIAL_REGION) { KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); + KMP_SET_THREAD_STATE(previous_state); } else { KMP_POP_PARTITIONED_TIMER(); } @@ -356,7 +347,7 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, KA_TRACE(20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", global_tid, num_teams, num_threads)); - + __kmp_assert_valid_gtid(global_tid); __kmp_push_num_teams(loc, global_tid, num_teams, num_threads); } @@ -411,16 +402,10 @@ void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); - __kmp_fork_call(loc, gtid, fork_context_intel, argc, - VOLATILE_CAST(microtask_t) - __kmp_teams_master, // "wrapped" task - VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - &ap -#else - ap -#endif - ); + __kmp_fork_call( + loc, gtid, fork_context_intel, argc, + VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task + VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap)); __kmp_join_call(loc, gtid #if OMPT_SUPPORT , @@ -452,6 +437,7 @@ void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, #if KMP_STATS_ENABLED if (previous_state == stats_state_e::SERIAL_REGION) { KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); + KMP_SET_THREAD_STATE(previous_state); } else { KMP_POP_PARTITIONED_TIMER(); } @@ -477,9 +463,10 @@ conditional parallel region, like this, when the condition is false. */ void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { -// The implementation is now in kmp_runtime.cpp so that it can share static -// functions with kmp_fork_call since the tasks to be done are similar in -// each case. + // The implementation is now in kmp_runtime.cpp so that it can share static + // functions with kmp_fork_call since the tasks to be done are similar in + // each case. + __kmp_assert_valid_gtid(global_tid); #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(global_tid); #endif @@ -507,6 +494,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { return; // Not autopar code + __kmp_assert_valid_gtid(global_tid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -581,7 +569,7 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { /* return to the parallel section */ -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_ABT if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) { __kmp_clear_x87_fpu_status_word(); __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); @@ -682,17 +670,6 @@ void __kmpc_flush(ident_t *loc) { // Nothing to see here move along #elif KMP_ARCH_PPC64 // Nothing needed here (we have a real MB above). -#if KMP_OS_CNK - // The flushing thread needs to yield here; this prevents a - // busy-waiting thread from saturating the pipeline. flush is - // often used in loops like this: - // while (!flag) { - // #pragma omp flush(flag) - // } - // and adding the yield here is good for at least a 10x speedup - // when running >2 threads per core (on the NAS LU benchmark). - __kmp_yield(); -#endif #else #error Unknown or unsupported architecture #endif @@ -716,6 +693,7 @@ Execute a barrier. 
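__kmpc_fork_call and __kmpc_fork_teams above drop the per-architecture #if around passing &ap versus ap and call kmp_va_addr_of(ap) instead; the macro itself is defined elsewhere in the runtime, not in this hunk. The sketch below shows the general technique with assumed SKETCH_* names, so it illustrates the idea rather than the runtime's exact definitions.

#include <stdarg.h>
#include <stdio.h>

/* Chosen per target ABI in a real runtime; here it is just a toggle. */
#define SKETCH_PASS_VA_BY_ADDRESS 1

#if SKETCH_PASS_VA_BY_ADDRESS
typedef va_list *sketch_va_list;
#define SKETCH_VA_ADDR_OF(ap) (&(ap))
#define SKETCH_VA_DEREF(ap) (*(ap))
#else
typedef va_list sketch_va_list;
#define SKETCH_VA_ADDR_OF(ap) (ap)
#define SKETCH_VA_DEREF(ap) (ap)
#endif

static int sum_ints(int argc, sketch_va_list ap) {
  int s = 0;
  for (int i = 0; i < argc; ++i)
    s += va_arg(SKETCH_VA_DEREF(ap), int);
  return s;
}

static int fork_like_entry(int argc, ...) {
  va_list ap;
  va_start(ap, argc);
  /* One call site for every architecture -- no #if at the caller. */
  int s = sum_ints(argc, SKETCH_VA_ADDR_OF(ap));
  va_end(ap);
  return s;
}

int main(void) {
  printf("%d\n", fork_like_entry(3, 1, 2, 3)); /* prints 6 */
  return 0;
}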
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { KMP_COUNT_BLOCK(OMP_BARRIER); KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -735,8 +713,8 @@ void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif __kmp_threads[global_tid]->th.th_ident = loc; // TODO: explicit barrier_wait_id: @@ -765,6 +743,7 @@ kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { int status = 0; KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -779,12 +758,12 @@ kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { #if OMPT_SUPPORT && OMPT_OPTIONAL if (status) { - if (ompt_enabled.ompt_callback_master) { + if (ompt_enabled.ompt_callback_masked) { kmp_info_t *this_thr = __kmp_threads[global_tid]; kmp_team_t *team = this_thr->th.th_team; int tid = __kmp_tid_from_gtid(global_tid); - ompt_callbacks.ompt_callback(ompt_callback_master)( + ompt_callbacks.ompt_callback(ompt_callback_masked)( ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), OMPT_GET_RETURN_ADDRESS(0)); @@ -819,16 +798,16 @@ thread that executes the master region. */ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); - + __kmp_assert_valid_gtid(global_tid); KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); KMP_POP_PARTITIONED_TIMER(); #if OMPT_SUPPORT && OMPT_OPTIONAL kmp_info_t *this_thr = __kmp_threads[global_tid]; kmp_team_t *team = this_thr->th.th_team; - if (ompt_enabled.ompt_callback_master) { + if (ompt_enabled.ompt_callback_masked) { int tid = __kmp_tid_from_gtid(global_tid); - ompt_callbacks.ompt_callback(ompt_callback_master)( + ompt_callbacks.ompt_callback(ompt_callback_masked)( ompt_scope_end, &(team->t.ompt_team_info.parallel_data), &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), OMPT_GET_RETURN_ADDRESS(0)); @@ -836,9 +815,6 @@ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { #endif if (__kmp_env_consistency_check) { - if (global_tid < 0) - KMP_WARNING(ThreadIdentInvalid); - if (KMP_MASTER_GTID(global_tid)) __kmp_pop_sync(global_tid, ct_master, loc); } @@ -857,6 +833,7 @@ void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_init_serial); KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid)); + __kmp_assert_valid_gtid(gtid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -874,8 +851,8 @@ void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) { kmp_team_t *team; ompt_wait_id_t lck; void *codeptr_ra; + OMPT_STORE_RETURN_ADDRESS(gtid); if (ompt_enabled.enabled) { - OMPT_STORE_RETURN_ADDRESS(gtid); team = __kmp_team_from_gtid(gtid); lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value; /* OMPT state update */ @@ -928,6 +905,7 @@ void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) { kmp_info_t *th; KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid)); + __kmp_assert_valid_gtid(gtid); #if USE_ITT_BUILD __kmp_itt_ordered_end(gtid); @@ -1150,7 +1128,7 @@ static kmp_user_lock_p 
__kmp_get_critical_section_ptr(kmp_critical_name *crit, /*! @ingroup WORK_SHARING @param loc source location information. -@param global_tid global thread number . +@param global_tid global thread number. @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or some other suitably unique value. @@ -1173,6 +1151,7 @@ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, kmp_user_lock_p lck; KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); // TODO: add THR_OVHD_STATE @@ -1270,7 +1249,7 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { if (hint & kmp_lock_hint_hle) return KMP_TSX_LOCK(hle); if (hint & kmp_lock_hint_rtm) - return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; + return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq; if (hint & kmp_lock_hint_adaptive) return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; @@ -1289,9 +1268,9 @@ static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) return lockseq_tas; - // HLE lock for speculation + // Use RTM lock for speculation if (hint & omp_lock_hint_speculative) - return KMP_TSX_LOCK(hle); + return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq; return __kmp_user_lock_seq; } @@ -1312,6 +1291,7 @@ __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { return kmp_mutex_impl_spin; #if KMP_USE_TSX case locktag_hle: + case locktag_rtm_spin: return kmp_mutex_impl_speculative; #endif default: @@ -1323,7 +1303,7 @@ __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { switch (ilock->type) { #if KMP_USE_TSX case locktag_adaptive: - case locktag_rtm: + case locktag_rtm_queuing: return kmp_mutex_impl_speculative; #endif case locktag_nested_tas: @@ -1357,7 +1337,8 @@ static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { return kmp_mutex_impl_queuing; #if KMP_USE_TSX case lk_hle: - case lk_rtm: + case lk_rtm_queuing: + case lk_rtm_spin: case lk_adaptive: return kmp_mutex_impl_speculative; #endif @@ -1395,6 +1376,7 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, #endif KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; // Check if it is initialized. @@ -1610,8 +1592,8 @@ this function. */ kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) { int status; - KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -1627,8 +1609,8 @@ kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = loc; @@ -1654,7 +1636,7 @@ still be waiting at the barrier and this call releases them. 
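__kmp_map_hint_to_lock above now routes the speculative hint to an RTM spin lock when the CPU supports RTM (falling back to the configured user lock otherwise) instead of unconditionally picking HLE. From user code the hint is only ever a request that a conforming runtime may honor or ignore. A small example of the user-facing side, using the OpenMP 4.5 omp_init_lock_with_hint API:

#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_lock_t lock;
  long counter = 0;

  /* Ask for a speculative (transactional) lock; the runtime picks the
   * actual implementation, e.g. RTM on capable x86 CPUs. */
  omp_init_lock_with_hint(&lock, omp_lock_hint_speculative);

#pragma omp parallel for
  for (int i = 0; i < 100000; ++i) {
    omp_set_lock(&lock);
    ++counter;            /* short critical section: a good fit for speculation */
    omp_unset_lock(&lock);
  }

  omp_destroy_lock(&lock);
  printf("counter = %ld\n", counter); /* expect 100000 */
  return 0;
}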
*/ void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) { KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid)); - + __kmp_assert_valid_gtid(global_tid); __kmp_end_split_barrier(bs_plain_barrier, global_tid); } @@ -1670,8 +1652,8 @@ There is no equivalent "end" function, since the */ kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { kmp_int32 ret; - KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -1691,8 +1673,8 @@ kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = loc; @@ -1709,14 +1691,9 @@ kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { if (__kmp_env_consistency_check) { /* there's no __kmpc_end_master called; so the (stats) */ /* actions of __kmpc_end_master are done here */ - - if (global_tid < 0) { - KMP_WARNING(ThreadIdentInvalid); - } if (ret) { /* only one thread should do the pop since only */ /* one did the push (see __kmpc_master()) */ - __kmp_pop_sync(global_tid, ct_master, loc); } } @@ -1737,6 +1714,7 @@ should introduce an explicit barrier if it is required. */ kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { + __kmp_assert_valid_gtid(global_tid); kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); if (rc) { @@ -1789,6 +1767,7 @@ only be called by the thread that executed the block of code protected by the `single` construct. */ void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { + __kmp_assert_valid_gtid(global_tid); __kmp_exit_single(global_tid); KMP_POP_PARTITIONED_TIMER(); @@ -1862,7 +1841,7 @@ void ompc_set_dynamic(int flag) { __kmp_save_internal_controls(thread); - set__dynamic(thread, flag ? TRUE : FALSE); + set__dynamic(thread, flag ? true : false); } void ompc_set_nested(int flag) { @@ -2068,8 +2047,8 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void (*cpy_func)(void *, void *), kmp_int32 didit) { void **data_ptr; - KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); + __kmp_assert_valid_gtid(gtid); KMP_MB(); @@ -2092,8 +2071,8 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif /* This barrier is not a barrier region boundary */ #if USE_ITT_NOTIFY @@ -2106,11 +2085,9 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, // Consider next barrier a user-visible barrier for barrier region boundaries // Nesting checks are already handled by the single construct checks - + { #if OMPT_SUPPORT - if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); - } #endif #if USE_ITT_NOTIFY __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
@@ -2122,6 +2099,7 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, ompt_frame->enter_frame = ompt_data_none; } #endif + } } /* -------------------------------------------------------------------------- */ @@ -2168,7 +2146,8 @@ __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) { #if KMP_USE_TSX // Don't have nested lock implementation for speculative locks - if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) + if (seq == lockseq_hle || seq == lockseq_rtm_queuing || + seq == lockseq_rtm_spin || seq == lockseq_adaptive) seq = __kmp_user_lock_seq; #endif switch (seq) { @@ -3353,7 +3332,7 @@ __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { th->th.th_team = team; th->th.th_team_nproc = team->t.t_nproc; th->th.th_task_team = team->t.t_task_team[task_state]; - th->th.th_task_state = task_state; + __kmp_type_convert(task_state, &(th->th.th_task_state)); } /* 2.a.i. Reduce Block without a terminating barrier */ @@ -3385,6 +3364,7 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, kmp_team_t *team; int teams_swapped = 0, task_state; KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); // why do we need this initialization here at all? // Reduction clause can not be used as a stand-alone directive. @@ -3429,13 +3409,18 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; + __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; + // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3479,8 +3464,8 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = loc; @@ -3533,18 +3518,24 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, PACKED_REDUCTION_METHOD_T packed_reduction_method; KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); + OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid); + if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; } else if (packed_reduction_method == empty_reduce_block) { // usage: if team size == 1, no synchronization is required ( on Intel // platforms only ) + OMPT_REDUCTION_END; + } else if (packed_reduction_method == atomic_reduce_block) { // neither master nor other workers should get here @@ -3556,6 +3547,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, tree_reduce_block)) { // only master gets here + // OMPT: tree reduction is annotated in the barrier code } else { @@ -3601,6 +3593,7 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 
global_tid, kmp_int32 num_vars, int teams_swapped = 0, task_state; KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); // why do we need this initialization here at all? // Reduction clause can not be a stand-alone directive. @@ -3629,13 +3622,17 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); + if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3656,8 +3653,8 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = @@ -3715,6 +3712,7 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, int teams_swapped = 0, task_state; KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); th = __kmp_thread_from_gtid(global_tid); teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); @@ -3723,10 +3721,13 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, // this barrier should be visible to a customer and to the threading profile // tool (it's a terminating barrier on constructs if NOWAIT not specified) + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; + // TODO: implicit barrier: should be exposed #if OMPT_SUPPORT ompt_frame_t *ompt_frame; @@ -3734,8 +3735,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = loc; @@ -3749,6 +3750,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_END; + // usage: if team size==1, no synchronization is required (Intel platforms only) // TODO: implicit barrier: should be exposed @@ -3758,8 +3761,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif #if USE_ITT_NOTIFY __kmp_threads[global_tid]->th.th_ident = loc; @@ -3779,8 +3782,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - 
OMPT_STORE_RETURN_ADDRESS(global_tid); } + OMPT_STORE_RETURN_ADDRESS(global_tid); #endif // TODO: implicit barrier: should be exposed #if USE_ITT_NOTIFY @@ -3866,6 +3869,7 @@ e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. */ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, const struct kmp_dim *dims) { + __kmp_assert_valid_gtid(gtid); int j, idx; kmp_int64 last, trace_count; kmp_info_t *th = __kmp_threads[gtid]; @@ -3961,7 +3965,8 @@ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, #endif if (flags == NULL) { // we are the first thread, allocate the array of flags - size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration + size_t size = + (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); KMP_MB(); sh_buf->doacross_flags = flags; @@ -3985,7 +3990,9 @@ void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, } void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { - kmp_int32 shft, num_dims, i; + __kmp_assert_valid_gtid(gtid); + kmp_int64 shft; + size_t num_dims, i; kmp_uint32 flag; kmp_int64 iter_number; // iteration number of "collapsed" loop nest kmp_info_t *th = __kmp_threads[gtid]; @@ -4002,10 +4009,13 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { // calculate sequential iteration number and check out-of-bounds condition pr_buf = th->th.th_dispatch; KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); - num_dims = pr_buf->th_doacross_info[0]; + num_dims = (size_t)pr_buf->th_doacross_info[0]; lo = pr_buf->th_doacross_info[2]; up = pr_buf->th_doacross_info[3]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case if (vec[0] < lo || vec[0] > up) { KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " @@ -4031,9 +4041,13 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { } iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_sink; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; - kmp_int32 j = i * 4; + size_t j = i * 4; ln = pr_buf->th_doacross_info[j + 1]; lo = pr_buf->th_doacross_info[j + 2]; up = pr_buf->th_doacross_info[j + 3]; @@ -4064,6 +4078,10 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_sink; +#endif } shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 @@ -4072,13 +4090,21 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { KMP_YIELD(TRUE); } KMP_MB(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims); + } +#endif KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", gtid, (iter_number << 5) + shft)); } void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { - kmp_int32 shft, num_dims, i; + __kmp_assert_valid_gtid(gtid); + kmp_int64 shft; + size_t num_dims, i; kmp_uint32 flag; kmp_int64 iter_number; // iteration number of "collapsed" loop nest 
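__kmpc_doacross_wait above collapses the multi-dimensional sink vector into a single iteration number (innermost dimension varying fastest) and then spins on one bit per iteration in a shared flag array with 32-bit granularity (shft = iter % 32, word index = iter >> 5); __kmpc_doacross_post, which begins here, sets the same bit. A stand-alone sketch of that encoding, with invented loop bounds:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Per-dimension loop descriptor: lower bound, upper bound, stride. */
typedef struct { int64_t lo, up, st; } dim_t;

static uint64_t dim_extent(const dim_t *d) {      /* iterations in one dimension */
  if (d->st > 0)
    return (uint64_t)(d->up - d->lo) / (uint64_t)d->st + 1;
  return (uint64_t)(d->lo - d->up) / (uint64_t)(-d->st) + 1;
}

/* Collapse an iteration vector into a 0-based linear iteration number. */
static uint64_t linearize(const dim_t *dims, int ndims, const int64_t *vec) {
  uint64_t iter = 0;
  for (int i = 0; i < ndims; ++i) {
    uint64_t k = (dims[i].st > 0)
                     ? (uint64_t)(vec[i] - dims[i].lo) / (uint64_t)dims[i].st
                     : (uint64_t)(dims[i].lo - vec[i]) / (uint64_t)(-dims[i].st);
    iter = iter * dim_extent(&dims[i]) + k;
  }
  return iter;
}

/* One bit per iteration, 32-bit words: post sets the bit, wait tests it. */
static void post_iter(uint32_t *flags, uint64_t iter) {
  flags[iter >> 5] |= 1u << (iter % 32);
}
static int is_posted(const uint32_t *flags, uint64_t iter) {
  return (flags[iter >> 5] >> (iter % 32)) & 1u;
}

int main(void) {
  dim_t dims[2] = {{0, 9, 1}, {2, 8, 2}};  /* 10 x 4 = 40 iterations */
  uint32_t flags[2];                       /* 40 bits -> 2 words */
  memset(flags, 0, sizeof(flags));

  int64_t vec[2] = {3, 6};
  uint64_t iter = linearize(dims, 2, vec); /* (3,6) -> 3*4 + 2 = 14 */
  post_iter(flags, iter);
  printf("iter %llu posted? %d\n", (unsigned long long)iter,
         is_posted(flags, iter));          /* prints 1 */
  return 0;
}

The real runtime additionally reports each linearized iteration to OMPT as an ompt_dependence_t of sink or source type, as the new OMPT_SUPPORT blocks above show.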
kmp_info_t *th = __kmp_threads[gtid]; @@ -4096,9 +4122,12 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { // out-of-bounds checks) pr_buf = th->th.th_dispatch; KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); - num_dims = pr_buf->th_doacross_info[0]; + num_dims = (size_t)pr_buf->th_doacross_info[0]; lo = pr_buf->th_doacross_info[2]; st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif if (st == 1) { // most common case iter_number = vec[0] - lo; } else if (st > 0) { @@ -4106,9 +4135,13 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { } else { // negative increment iter_number = (kmp_uint64)(lo - vec[0]) / (-st); } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_source; +#endif for (i = 1; i < num_dims; ++i) { kmp_int64 iter, ln; - kmp_int32 j = i * 4; + size_t j = i * 4; ln = pr_buf->th_doacross_info[j + 1]; lo = pr_buf->th_doacross_info[j + 2]; st = pr_buf->th_doacross_info[j + 4]; @@ -4120,7 +4153,17 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { iter = (kmp_uint64)(lo - vec[i]) / (-st); } iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_source; +#endif } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims); + } +#endif shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 flag = 1 << shft; @@ -4132,6 +4175,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { } void __kmpc_doacross_fini(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); kmp_int32 num_done; kmp_info_t *th = __kmp_threads[gtid]; kmp_team_t *team = th->th.th_team; @@ -4142,7 +4186,8 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) { KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); return; // nothing to do } - num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1; + num_done = + KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1; if (num_done == th->th.th_team_nproc) { // we are the last thread, need to free shared resources int idx = pr_buf->th_doacross_buf_idx - 1; @@ -4165,11 +4210,21 @@ void __kmpc_doacross_fini(ident_t *loc, int gtid) { KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); } -/* omp_alloc/omp_free only defined for C/C++, not for Fortran */ +/* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { return __kmpc_alloc(__kmp_entry_gtid(), size, allocator); } +void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) { + return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator); +} + +void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator, + free_allocator); +} + void omp_free(void *ptr, omp_allocator_handle_t allocator) { __kmpc_free(__kmp_entry_gtid(), ptr, allocator); } diff --git a/runtime/src/kmp_debugger.cpp b/runtime/src/kmp_debugger.cpp index 490300f9b..2a1f633c4 100644 --- a/runtime/src/kmp_debugger.cpp +++ b/runtime/src/kmp_debugger.cpp 
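The allocator entry points just above gain omp_calloc and omp_realloc next to the existing omp_alloc/omp_free (C/C++ only, as the comment notes). A small usage example against the OpenMP 5.1 allocator API; it needs an omp.h and runtime that provide these entry points:

#include <omp.h>
#include <stdio.h>

int main(void) {
  /* Zero-initialized array of 8 doubles from the default allocator. */
  double *a = (double *)omp_calloc(8, sizeof(double), omp_default_mem_alloc);
  if (!a)
    return 1;
  a[0] = 42.0;

  /* Grow it: contents are preserved, the old storage is released. */
  a = (double *)omp_realloc(a, 16 * sizeof(double), omp_default_mem_alloc,
                            omp_default_mem_alloc);
  if (!a)
    return 1;
  printf("a[0] = %g\n", a[0]); /* still 42 */

  omp_free(a, omp_default_mem_alloc);
  return 0;
}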
@@ -269,7 +269,7 @@ int __kmp_omp_num_threads(ident_t const *ident) { if (info->num > 0 && info->array != 0) { kmp_omp_nthr_item_t *items = (kmp_omp_nthr_item_t *)__kmp_convert_to_ptr(info->array); - kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, 1); + kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, true); int i; for (i = 0; i < info->num; ++i) { if (kmp_location_match(&loc, &items[i])) { diff --git a/runtime/src/kmp_dispatch.cpp b/runtime/src/kmp_dispatch.cpp index 161a2c696..a805ee44d 100644 --- a/runtime/src/kmp_dispatch.cpp +++ b/runtime/src/kmp_dispatch.cpp @@ -69,16 +69,24 @@ void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { } // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC -static inline int __kmp_get_monotonicity(enum sched_type schedule, +static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, bool use_hier = false) { // Pick up the nonmonotonic/monotonic bits from the scheduling type - int monotonicity; - // default to monotonic - monotonicity = SCHEDULE_MONOTONIC; - if (SCHEDULE_HAS_NONMONOTONIC(schedule)) + // TODO: make nonmonotonic when static_steal is fixed + int monotonicity = SCHEDULE_MONOTONIC; + + // Let default be monotonic for executables + // compiled with OpenMP* 4.5 or less compilers + if (loc->get_openmp_version() < 50) + monotonicity = SCHEDULE_MONOTONIC; + + if (use_hier) + monotonicity = SCHEDULE_MONOTONIC; + else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) monotonicity = SCHEDULE_NONMONOTONIC; else if (SCHEDULE_HAS_MONOTONIC(schedule)) monotonicity = SCHEDULE_MONOTONIC; + return monotonicity; } @@ -146,7 +154,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, #endif /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ - monotonicity = __kmp_get_monotonicity(schedule, use_hier); + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); /* Pick up the nomerge/ordered bits from the scheduling type */ @@ -177,7 +185,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if // not specified) schedule = team->t.t_sched.r_sched_type; - monotonicity = __kmp_get_monotonicity(schedule, use_hier); + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); // Detail the schedule if needed (global controls are differentiated // appropriately) @@ -244,7 +252,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, if (schedule == kmp_sch_runtime_simd) { // compiler provides simd_width in the chunk parameter schedule = team->t.t_sched.r_sched_type; - monotonicity = __kmp_get_monotonicity(schedule, use_hier); + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); // Detail the schedule if needed (global controls are differentiated // appropriately) @@ -372,21 +380,22 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, // before spending time on this). // For now use dynamically allocated per-thread lock, // free memory in __kmp_dispatch_next when status==0. 
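__kmp_get_monotonicity above now takes the ident_t so it can keep the monotonic default for code compiled against OpenMP 4.5 or earlier, force monotonic when the hierarchical scheduler is active, and otherwise honor an explicit monotonic/nonmonotonic modifier. On the user side the modifier is spelled in the schedule clause (OpenMP 4.5+ syntax):

#include <omp.h>
#include <stdio.h>

int main(void) {
  long long sum = 0;
  /* Explicitly nonmonotonic dynamic schedule: a thread may receive chunks
   * out of logical iteration order, which is what permits chunk stealing. */
#pragma omp parallel for schedule(nonmonotonic : dynamic, 64) reduction(+ : sum)
  for (int i = 0; i < 1000000; ++i)
    sum += i;
  printf("sum = %lld\n", sum); /* 499999500000 */
  return 0;
}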
- KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); - th->th.th_dispatch->th_steal_lock = + KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); + pr->u.p.th_steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); - __kmp_init_lock(th->th.th_dispatch->th_steal_lock); + __kmp_init_lock(pr->u.p.th_steal_lock); } break; } else { - KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " - "kmp_sch_static_balanced\n", - gtid)); - schedule = kmp_sch_static_balanced; - /* too few iterations: fall-through to kmp_sch_static_balanced */ + /* too few chunks: switching to kmp_sch_dynamic_chunked */ + schedule = kmp_sch_dynamic_chunked; + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " + "kmp_sch_dynamic_chunked\n", + gtid)); + if (pr->u.p.parm1 <= 0) + pr->u.p.parm1 = KMP_DEFAULT_CHUNK; + break; } // if - /* FALL-THROUGH to static balanced */ - KMP_FALLTHROUGH(); } // case #endif case kmp_sch_static_balanced: { @@ -485,7 +494,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, // when remaining iters become less than parm2 - switch to dynamic pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); *(double *)&pr->u.p.parm3 = - guided_flt_param / nproc; // may occupy parm3 and parm4 + guided_flt_param / (double)nproc; // may occupy parm3 and parm4 } } else { KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " @@ -535,7 +544,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, UT cross; /* commonly used term: (2 nproc - 1)/(2 nproc) */ - x = (long double)1.0 - (long double)0.5 / nproc; + x = 1.0 - 0.5 / (double)nproc; #ifdef KMP_DEBUG { // test natural alignment @@ -772,6 +781,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, sizeof(dispatch_private_info)); KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template) == sizeof(dispatch_shared_info)); + __kmp_assert_valid_gtid(gtid); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); @@ -967,7 +977,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, // all parm3 will be the same, it still exists a bad case like using 0 and 1 // rather than program life-time increment. So the dedicated variable is // required. The 'static_steal_counter' is used. - if (schedule == kmp_sch_static_steal) { + if (pr->schedule == kmp_sch_static_steal) { // Other threads will inspect this variable when searching for a victim. // This is a flag showing that other threads may steal from this thread // since then. 
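For kmp_sch_static_steal the per-loop lock now lives in the dispatch buffer itself (pr->u.p.th_steal_lock) rather than in the thread's dispatch state, so each concurrently active loop gets its own lock; the stealing rule it protects (take roughly a quarter of the victim's remaining chunks, or one) appears further down in __kmp_dispatch_next_algorithm. A compact stand-alone sketch of that scheme, with std::mutex standing in for the runtime's locks and a counter standing in for chunk execution:

#include <atomic>
#include <cstdio>
#include <mutex>
#include <thread>

constexpr int kThreads = 4;
constexpr long kChunksPerThread = 32;

struct Range {          // per-worker chunk range [count, ub)
  std::mutex lock;
  long count = 0;
  long ub = 0;
};

Range ranges[kThreads];
std::atomic<long> executed{0};

long grab_or_steal(int me) {
  {
    std::lock_guard<std::mutex> g(ranges[me].lock);
    if (ranges[me].count < ranges[me].ub)
      return ranges[me].count++;              // fast path: own work left
  }
  for (int k = 1; k < kThreads; ++k) {        // slow path: look for a victim
    int v = (me + k) % kThreads;
    long lo = 0, hi = 0;
    {
      std::lock_guard<std::mutex> g(ranges[v].lock);
      long remaining = ranges[v].ub - ranges[v].count;
      if (remaining < 2)
        continue;                             // not enough chunks to split
      long steal = remaining > 3 ? remaining / 4 : 1;
      hi = ranges[v].ub;                      // take chunks off the tail
      lo = hi - steal;
      ranges[v].ub = lo;
    }
    std::lock_guard<std::mutex> g(ranges[me].lock);
    ranges[me].count = lo + 1;                // chunk 'lo' runs immediately
    ranges[me].ub = hi;
    return lo;
  }
  return -1;                                  // everything is done
}

void worker(int me) {
  while (grab_or_steal(me) >= 0)
    executed.fetch_add(1);                    // "execute" one chunk
}

int main() {
  ranges[0].count = 0;                        // uneven start: thread 0 owns all
  ranges[0].ub = kThreads * kChunksPerThread;
  std::thread ts[kThreads];
  for (int i = 0; i < kThreads; ++i)
    ts[i] = std::thread(worker, i);
  for (auto &t : ts)
    t.join();
  std::printf("executed %ld of %ld chunks\n", executed.load(),
              kThreads * kChunksPerThread);   // expect all of them
  return 0;
}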
@@ -996,6 +1006,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, template static void __kmp_dispatch_finish(int gtid, ident_t *loc) { typedef typename traits_t::signed_t ST; + __kmp_assert_valid_gtid(gtid); kmp_info_t *th = __kmp_threads[gtid]; KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); @@ -1059,6 +1070,7 @@ static void __kmp_dispatch_finish(int gtid, ident_t *loc) { template static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { typedef typename traits_t::signed_t ST; + __kmp_assert_valid_gtid(gtid); kmp_info_t *th = __kmp_threads[gtid]; KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); @@ -1146,7 +1158,7 @@ int __kmp_dispatch_next_algorithm(int gtid, typedef typename traits_t::signed_t ST; typedef typename traits_t::floating_t DBL; int status = 0; - kmp_int32 last = 0; + bool last = false; T start; ST incr; UT limit, trip, init; @@ -1194,7 +1206,7 @@ int __kmp_dispatch_next_algorithm(int gtid, if (traits_t::type_size > 4) { // use lock for 8-byte and CAS for 4-byte induction // variable. TODO (optional): check and use 16-byte CAS - kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; + kmp_lock_t *lck = pr->u.p.th_steal_lock; KMP_DEBUG_ASSERT(lck != NULL); if (pr->u.p.count < (UT)pr->u.p.ub) { __kmp_acquire_lock(lck, gtid); @@ -1207,39 +1219,40 @@ int __kmp_dispatch_next_algorithm(int gtid, } if (!status) { // try to steal kmp_info_t **other_threads = team->t.t_threads; - int while_limit = pr->u.p.parm3; - int while_index = 0; + T while_limit = pr->u.p.parm3; + T while_index = 0; + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template *victim; T remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template *victim = - reinterpret_cast *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid continue; // not enough chunks to steal, goto next victim } - lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; + lck = victim->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_acquire_lock(lck, gtid); limit = victim->u.p.ub; // keep initial ub @@ -1249,7 +1262,7 @@ int __kmp_dispatch_next_algorithm(int gtid, pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim continue; // not enough chunks to steal } - // stealing succeded, reduce victim's ub by 1/4 of undone chunks or + // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or // by 1 if (remaining > 3) { // steal 1/4 of remaining @@ -1267,10 +1280,10 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 1; while_index = 0; // now update own count and ub with stolen range but init chunk - __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); pr->u.p.count = init + 1; pr->u.p.ub = limit; - __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); + __kmp_release_lock(pr->u.p.th_steal_lock, gtid); } // while (search for victim) } // if (try to find victim and steal) } else { @@ -1305,34 +1318,34 @@ int __kmp_dispatch_next_algorithm(int gtid, if (!status) { kmp_info_t **other_threads = team->t.t_threads; - int while_limit = pr->u.p.parm3; - int while_index = 0; - + T while_limit = pr->u.p.parm3; + T while_index = 0; + T id = pr->u.p.static_steal_counter; // loop id + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop // TODO: algorithm of searching for a victim // should be cleaned up and measured while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template *victim; union_i4 vold, vnew; - kmp_int32 remaining; + T remaining; T victimIdx = pr->u.p.parm4; T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - dispatch_private_info_template *victim = - reinterpret_cast *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); - while ((victim == NULL || victim == pr || - (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) && + victim = reinterpret_cast *>( + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); + while ((victim == pr || id != victim->u.p.static_steal_counter) && oldVictimIdx != victimIdx) { victimIdx = (victimIdx + 1) % nproc; victim = reinterpret_cast *>( - other_threads[victimIdx] - ->th.th_dispatch->th_dispatch_pr_current); + &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(victim); } - if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != - *(volatile T *)&pr->u.p.static_steal_counter)) { + if (victim == pr || id != victim->u.p.static_steal_counter) { continue; // try once more (nproc attempts in total) // no victim is ready yet to participate in stealing - // because all victims are still in kmp_init_dispatch + // because no victim passed kmp_init_dispatch yet } pr->u.p.parm4 = victimIdx; // new victim found while (1) { // CAS loop if victim has enough chunks to steal @@ -1346,7 +1359,8 @@ int __kmp_dispatch_next_algorithm(int gtid, break; // not enough chunks to steal, goto next victim } if (remaining > 3) { - vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining + // try to steal 1/4 of remaining + vnew.p.ub -= remaining >> 2; } else { vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining } @@ -1356,7 +1370,7 @@ int __kmp_dispatch_next_algorithm(int gtid, (volatile kmp_int64 *)&victim->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { - // stealing succedded + // stealing succeeded KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); status = 1; @@ -1371,7 +1385,7 @@ int __kmp_dispatch_next_algorithm(int gtid, #endif break; } // if (check CAS result) - KMP_CPU_PAUSE(); // CAS failed, repeate attempt + KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt } // while (try to steal from particular victim) } // while (search for victim) } // if (try to find victim and steal) @@ -1420,7 +1434,7 @@ int __kmp_dispatch_next_algorithm(int gtid, pr->u.p.count = 1; *p_lb = pr->u.p.lb; *p_ub = pr->u.p.ub; - last = pr->u.p.parm1; + last = (pr->u.p.parm1 != 0); if (p_st != NULL) *p_st = pr->u.p.st; } else { /* no iterations to do */ @@ -1531,8 +1545,8 @@ int __kmp_dispatch_next_algorithm(int gtid, } if ((T)remaining < pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default - // use dynamic-style shcedule - // atomically inrement iterations, get old value + // use dynamic-style schedule + // atomically increment iterations, get old value init = test_then_add(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec); remaining = trip - init; @@ -1544,14 +1558,14 @@ int __kmp_dispatch_next_algorithm(int gtid, if ((T)remaining > chunkspec) { limit = init + chunkspec - 1; } else { - last = 1; // the last chunk + last = true; // the last chunk limit = init + remaining - 1; } // if } // if break; } // if - limit = init + - (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc + limit = init + (UT)((double)remaining * + *(double *)&pr->u.p.parm3); // divide by K*nproc if (compare_and_swap(RCAST(volatile ST *, &sh->u.s.iteration), (ST)init, (ST)limit)) { // CAS was successful, chunk obtained @@ -1600,8 +1614,8 @@ int 
__kmp_dispatch_next_algorithm(int gtid, KMP_DEBUG_ASSERT(init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { - // use dynamic-style shcedule - // atomically inrement iterations, get old value + // use dynamic-style schedule + // atomically increment iterations, get old value init = test_then_add(RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk); remaining = trip - init; @@ -1613,14 +1627,16 @@ int __kmp_dispatch_next_algorithm(int gtid, if ((T)remaining > chunk) { limit = init + chunk - 1; } else { - last = 1; // the last chunk + last = true; // the last chunk limit = init + remaining - 1; } // if } // if break; } // if // divide by K*nproc - UT span = remaining * (*(double *)&pr->u.p.parm3); + UT span; + __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), + &span); UT rem = span % chunk; if (rem) // adjust so that span%chunk == 0 span += chunk - rem; @@ -1891,13 +1907,14 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, typedef typename traits_t::unsigned_t UT; typedef typename traits_t::signed_t ST; // This is potentially slightly misleading, schedule(runtime) will appear here - // even if the actual runtme schedule is static. (Which points out a - // disadavantage of schedule(runtime): even when static scheduling is used it + // even if the actual runtime schedule is static. (Which points out a + // disadvantage of schedule(runtime): even when static scheduling is used it // costs more than a compile time choice to use static scheduling would.) KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); int status; dispatch_private_info_template *pr; + __kmp_assert_valid_gtid(gtid); kmp_info_t *th = __kmp_threads[gtid]; kmp_team_t *team = th->th.th_team; @@ -1908,7 +1925,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, gtid, p_lb, p_ub, p_st, p_last)); if (team->t.t_serialized) { - /* NOTE: serialize this dispatch becase we are not at the active level */ + /* NOTE: serialize this dispatch because we are not at the active level */ pr = reinterpret_cast *>( th->th.th_dispatch->th_disp_buffer); /* top of the stack */ KMP_DEBUG_ASSERT(pr); @@ -2010,7 +2027,8 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", traits_t::spec, traits_t::spec, traits_t::spec); - KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); + KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, + (p_last ? 
*p_last : 0), status)); __kmp_str_free(&buff); } #endif @@ -2067,14 +2085,19 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, if (pr->schedule == kmp_sch_static_steal && traits_t::type_size > 4) { int i; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index kmp_info_t **other_threads = team->t.t_threads; // loop complete, safe to destroy locks used for stealing for (i = 0; i < th->th.th_team_nproc; ++i) { - kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; + dispatch_private_info_template *buf = + reinterpret_cast *>( + &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); + kmp_lock_t *lck = buf->u.p.th_steal_lock; KMP_ASSERT(lck != NULL); __kmp_destroy_lock(lck); __kmp_free(lck); - other_threads[i]->th.th_dispatch->th_steal_lock = NULL; + buf->u.p.th_steal_lock = NULL; } } #endif @@ -2185,6 +2208,7 @@ static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); } } + __kmp_assert_valid_gtid(gtid); th = __kmp_threads[gtid]; team = th->th.th_team; KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct diff --git a/runtime/src/kmp_dispatch.h b/runtime/src/kmp_dispatch.h index 8b3e98435..1f98e4b80 100644 --- a/runtime/src/kmp_dispatch.h +++ b/runtime/src/kmp_dispatch.h @@ -75,7 +75,7 @@ template struct dispatch_private_infoXX_template { ST st; // signed UT tc; // unsigned T static_steal_counter; // for static_steal only; maybe better to put after ub - + kmp_lock_t *th_steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) diff --git a/runtime/src/kmp_dispatch_hier.h b/runtime/src/kmp_dispatch_hier.h index 24a6d6691..721c7f678 100644 --- a/runtime/src/kmp_dispatch_hier.h +++ b/runtime/src/kmp_dispatch_hier.h @@ -273,7 +273,7 @@ void core_barrier_impl::barrier(kmp_int32 id, "next_index:%llu curr_wait:%llu next_wait:%llu\n", __kmp_get_gtid(), current_index, next_index, current_wait_value, next_wait_value)); - char v = (current_wait_value ? 0x1 : 0x0); + char v = (current_wait_value ? 
'\1' : '\0'); (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v; __kmp_wait(&(bdata->val[current_index]), current_wait_value, __kmp_eq USE_ITT_BUILD_ARG(NULL)); @@ -537,8 +537,10 @@ template struct kmp_hier_t { // When no iterations are found (status == 0) and this is not the last // layer, attempt to go up the hierarchy for more iterations if (status == 0 && !last_layer) { + kmp_int32 hid; + __kmp_type_convert(hier_id, &hid); status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub, - &my_st, hier_id, hier_level + 1); + &my_st, hid, hier_level + 1); KD_TRACE( 10, ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n", @@ -748,8 +750,10 @@ template struct kmp_hier_t { bool done = false; while (!done) { done = true; + kmp_int32 uid; + __kmp_type_convert(unit_id, &uid); status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, - p_st, unit_id, 0); + p_st, uid, 0); if (status == 1) { __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, parent->get_next_lb(tdata->index), @@ -803,8 +807,10 @@ template struct kmp_hier_t { bool done = false; while (!done) { done = true; + kmp_int32 uid; + __kmp_type_convert(unit_id, &uid); status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, - p_st, unit_id, 0); + p_st, uid, 0); if (status == 1) { sh = parent->get_curr_sh(tdata->index); __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, @@ -993,7 +999,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); } - // Have threads "register" themselves by modifiying the active count for each + // Have threads "register" themselves by modifying the active count for each // level they are involved in. The active count will act as nthreads for that // level regarding the scheduling algorithms for (int i = 0; i < n; ++i) { @@ -1071,7 +1077,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, my_unit->reset_shared_barrier(); my_unit->hier_pr.flags.contains_last = FALSE; // Last layer, initialize the private buffers with entire loop information - // Now the next next_algorithim() call will get the first chunk of + // Now the next next_algorithm() call will get the first chunk of // iterations properly if (i == n - 1) { __kmp_dispatch_init_algorithm( diff --git a/runtime/src/kmp_environment.cpp b/runtime/src/kmp_environment.cpp index 51bc3cf45..19c59be6c 100644 --- a/runtime/src/kmp_environment.cpp +++ b/runtime/src/kmp_environment.cpp @@ -380,16 +380,13 @@ static void ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill. char **env // I: Unix environment to parse. ) { - char *bulk = NULL; kmp_env_var_t *vars = NULL; int count = 0; - int size = 0; // Size of bulk. + size_t size = 0; // Size of bulk. // Count number of variables and length of required bulk. { - count = 0; - size = 0; while (env[count] != NULL) { size += KMP_STRLEN(env[count]) + 1; ++count; @@ -405,7 +402,7 @@ ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill. char *var; // Pointer to beginning of var. char *name; // Pointer to name of variable. char *value; // Pointer to value. - int len; // Length of variable. + size_t len; // Length of variable. 
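Several hunks above route narrowing conversions through __kmp_type_convert instead of implicit casts: hier_id/unit_id down to kmp_int32 in kmp_dispatch_hier.h, the guided-schedule span in __kmp_dispatch_next_algorithm, and task_state in __kmp_restore_swapped_teams. The helper's definition is not part of this diff; a plausible shape for such a checked conversion, under an assumed name, is:

// Sketch of a checked narrowing helper in the spirit of __kmp_type_convert();
// the real runtime's definition is not in this diff, so the name and the
// assert policy here are assumptions for illustration only.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <type_traits>

template <typename Src, typename Dst>
void checked_convert(Src src, Dst *dst) {
  static_assert(std::is_integral_v<Src> && std::is_integral_v<Dst>,
                "integral conversions only");
  if constexpr (std::is_signed_v<Src>) {
    if (src < 0) {
      // A negative value must fit in a signed destination.
      assert(std::is_signed_v<Dst> &&
             (long long)src >= (long long)(std::numeric_limits<Dst>::min)());
      *dst = static_cast<Dst>(src);
      return;
    }
  }
  // Non-negative value: it must not exceed the destination's maximum.
  assert((unsigned long long)src <=
         (unsigned long long)(std::numeric_limits<Dst>::max)());
  *dst = static_cast<Dst>(src);
}

int main() {
  std::int64_t hier_id = 123456;  // the name only mirrors the call sites above
  std::int32_t hid;
  checked_convert(hier_id, &hid); // fits, so the debug assert passes
  std::size_t len = 42;
  int ilen;
  checked_convert(len, &ilen);
  std::printf("%d %d\n", hid, ilen);
  return 0;
}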
int i; var = bulk; for (i = 0; i < count; ++i) { diff --git a/runtime/src/kmp_environment.h b/runtime/src/kmp_environment.h index 76a9672f3..a7ea9e955 100644 --- a/runtime/src/kmp_environment.h +++ b/runtime/src/kmp_environment.h @@ -1,5 +1,5 @@ /* - * kmp_environment.h -- Handle environment varoiables OS-independently. + * kmp_environment.h -- Handle environment variables OS-independently. */ //===----------------------------------------------------------------------===// diff --git a/runtime/src/kmp_error.cpp b/runtime/src/kmp_error.cpp index b30b26e3a..7fc0ce17a 100644 --- a/runtime/src/kmp_error.cpp +++ b/runtime/src/kmp_error.cpp @@ -415,9 +415,6 @@ void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident) { __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident, &p->stack_data[tos]); } - if (gtid < 0) { - __kmp_check_null_func(); - } KE_TRACE(100, (POP_MSG(p))); p->s_top = p->stack_data[tos].prev; p->stack_data[tos].type = ct_none; diff --git a/runtime/src/kmp_ftn_entry.h b/runtime/src/kmp_ftn_entry.h index e480e0151..df7d98c07 100644 --- a/runtime/src/kmp_ftn_entry.h +++ b/runtime/src/kmp_ftn_entry.h @@ -78,7 +78,7 @@ void FTN_STDCALL FTN_SET_STACKSIZE_S(size_t KMP_DEREF arg) { int FTN_STDCALL FTN_GET_STACKSIZE(void) { #ifdef KMP_STUB - return __kmps_get_stacksize(); + return (int)__kmps_get_stacksize(); #else if (!__kmp_init_serial) { __kmp_serial_initialize(); @@ -530,8 +530,12 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) { #else int gtid; -#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_HURD +#if KMP_USE_ABT + if (!__kmp_init_parallel) + return 0; + gtid = __kmp_gtid_get_specific(); +#elif KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_HURD|| KMP_OS_OPENBSD gtid = __kmp_entry_gtid(); #elif KMP_OS_WINDOWS if (!__kmp_init_parallel || @@ -551,8 +555,8 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) { } else { #endif if (!__kmp_init_parallel || - (gtid = (kmp_intptr_t)( - pthread_getspecific(__kmp_gtid_threadprivate_key))) == 0) { + (gtid = (int)((kmp_intptr_t)( + pthread_getspecific(__kmp_gtid_threadprivate_key)))) == 0) { return 0; } --gtid; @@ -628,7 +632,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DYNAMIC)(int KMP_DEREF flag) { thread = __kmp_entry_thread(); // !!! What if foreign thread calls it? __kmp_save_internal_controls(thread); - set__dynamic(thread, KMP_DEREF flag ? TRUE : FALSE); + set__dynamic(thread, KMP_DEREF flag ? true : false); #endif } @@ -654,7 +658,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_PARALLEL)(void) { // The solution is to use per-team nesting level return (th->th.th_team->t.t_active_level ? 1 : 0); } else - return (th->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE); + return (th->th.th_root->r.r_active ? FTN_TRUE : FTN_FALSE); #endif } @@ -939,7 +943,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) { // Get number of NON-HOST devices. // libomptarget, if loaded, provides this function in api.cpp. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) return 0; @@ -956,23 +960,25 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { } // This function always returns true when called on host device. 
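omp_get_num_devices above (like the neighboring device entry points) is declared with KMP_WEAK_ATTRIBUTE_EXTERNAL so the host runtime ships only a weak default, which libomptarget replaces with a real implementation when it is linked in; the dlsym(RTLD_NEXT, ...) probe is the dynamic-loading counterpart of the same idea. A tiny illustration of the weak-default pattern on ELF toolchains; the function name is invented for the example:

/* Weak default: used when no other object defines the symbol.  Linking an
 * object that provides a strong omp_sketch_num_devices() overrides this. */
#include <stdio.h>

__attribute__((weak)) int omp_sketch_num_devices(void) {
  return 0;   /* host-only default, like the runtime's weak entry points */
}

int main(void) {
  printf("devices: %d\n", omp_sketch_num_devices());
  return 0;
}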
-// Compilier/libomptarget should handle when it is called inside target region. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE; +// Compiler/libomptarget should handle when it is called inside target region. +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) { return 1; // This is the host } // libomptarget, if loaded, provides this function -int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) - return KMP_HOST_DEVICE; + // same as omp_get_num_devices() + return 0; #else int (*fptr)(); if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_initial_device"))) { return (*fptr)(); } else { // liboffload & libomptarget don't exist - return KMP_HOST_DEVICE; + // same as omp_get_num_devices() + return 0; } #endif } @@ -1318,15 +1324,15 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) { // This function will be defined in libomptarget. When libomptarget is not // loaded, we assume we are on the host and return KMP_HOST_DEVICE. // Compiler/libomptarget will handle this if called inside target. -int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE; -int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; } +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return FTN_GET_INITIAL_DEVICE(); } // Compiler will ensure that this is only called from host in sequential region int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) { #ifdef KMP_STUB return 1; // just fail #else - if (device_num == KMP_HOST_DEVICE) + if (device_num == FTN_GET_INITIAL_DEVICE()) return __kmpc_pause_resource(kind); else { #if !KMP_OS_WINDOWS @@ -1371,6 +1377,13 @@ void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) { #endif } +// display environment variables when requested +void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { +#ifndef KMP_STUB + __kmp_omp_display_env(verbose); +#endif +} + // GCC compatibility (versioned symbols) #ifdef KMP_USE_VERSION_SYMBOLS diff --git a/runtime/src/kmp_ftn_os.h b/runtime/src/kmp_ftn_os.h index 41cafab12..39958e2db 100644 --- a/runtime/src/kmp_ftn_os.h +++ b/runtime/src/kmp_ftn_os.h @@ -133,6 +133,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels +#define FTN_DISPLAY_ENV omp_display_env #define FTN_FULFILL_EVENT omp_fulfill_event #endif /* KMP_FTN_PLAIN */ @@ -256,6 +257,7 @@ #define FTN_PAUSE_RESOURCE omp_pause_resource_ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ +#define FTN_DISPLAY_ENV omp_display_env_ #define FTN_FULFILL_EVENT omp_fulfill_event_ #endif /* KMP_FTN_APPEND */ @@ -377,6 +379,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT #endif /* KMP_FTN_UPPER */ @@ -500,6 +503,7 @@ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS 
OMP_GET_SUPPORTED_ACTIVE_LEVELS_ +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_ #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ #endif /* KMP_FTN_UAPPEND */ @@ -654,4 +658,28 @@ #define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED \ GOMP_parallel_loop_nonmonotonic_guided +// All GOMP_5.0 symbols +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_maybe_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_TEAMS_REG GOMP_teams_reg +#define KMP_API_NAME_GOMP_TASKWAIT_DEPEND GOMP_taskwait_depend + #endif /* KMP_FTN_OS_H */ diff --git a/runtime/src/kmp_global.cpp b/runtime/src/kmp_global.cpp index 6e636dc39..bfd145d17 100644 --- a/runtime/src/kmp_global.cpp +++ b/runtime/src/kmp_global.cpp @@ -16,7 +16,11 @@ #include "kmp_dispatch_hier.h" #endif +#if KMP_USE_ABT +kmp_pth_key_t __kmp_gtid_threadprivate_key; +#else kmp_key_t __kmp_gtid_threadprivate_key; +#endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 kmp_cpuinfo_t __kmp_cpuinfo = {0}; // Not initialized @@ -206,6 +210,13 @@ int __kmp_display_env = FALSE; int __kmp_display_env_verbose = FALSE; int __kmp_omp_cancellation = FALSE; +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +int __kmp_user_level_mwait = FALSE; +int __kmp_umwait_enabled = FALSE; +int __kmp_mwait_enabled = FALSE; +int __kmp_mwait_hints = 0; +#endif + /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { @@ -408,6 +419,11 @@ kmp_int32 __kmp_use_yield_exp_set = 0; kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT; kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; +#if KMP_USE_ABT +KMP_ALIGN_CACHE +kmp_abt_global_t __kmp_abt_global; +#endif + /* ------------------------------------------------------ */ /* STATE mostly syncronized with global lock */ /* data written to rarely by masters, read often by workers */ @@ -415,6 +431,9 @@ kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; of declaration is not necessarily correlated to storage order. To fix this, all the important globals must be put in a big structure instead. 
*/ KMP_ALIGN_CACHE +#if KMP_REMOVE_FORKJOIN_LOCK +kmp_bootstrap_lock_t __kmp_threads_lock; +#endif kmp_info_t **__kmp_threads = NULL; kmp_root_t **__kmp_root = NULL; @@ -422,7 +441,17 @@ kmp_root_t **__kmp_root = NULL; KMP_ALIGN_CACHE volatile int __kmp_nth = 0; volatile int __kmp_all_nth = 0; + +#if KMP_REMOVE_FORKJOIN_LOCK +KMP_ALIGN_CACHE +kmp_bootstrap_lock_t __kmp_thread_pool_lock; +#endif volatile kmp_info_t *__kmp_thread_pool = NULL; + +#if KMP_REMOVE_FORKJOIN_LOCK +KMP_ALIGN_CACHE +kmp_bootstrap_lock_t __kmp_team_pool_lock; +#endif volatile kmp_team_t *__kmp_team_pool = NULL; KMP_ALIGN_CACHE diff --git a/runtime/src/kmp_gsupport.cpp b/runtime/src/kmp_gsupport.cpp index 10841d265..11a35873f 100644 --- a/runtime/src/kmp_gsupport.cpp +++ b/runtime/src/kmp_gsupport.cpp @@ -17,6 +17,73 @@ #include "ompt-specific.h" #endif +enum { + KMP_GOMP_TASK_UNTIED_FLAG = 1, + KMP_GOMP_TASK_FINAL_FLAG = 2, + KMP_GOMP_TASK_DEPENDS_FLAG = 8 +}; + +// This class helps convert gomp dependency info into +// kmp_depend_info_t structures +class kmp_gomp_depends_info_t { + void **depend; + kmp_int32 num_deps; + size_t num_out, num_mutexinout, num_in; + size_t offset; + +public: + kmp_gomp_depends_info_t(void **depend) : depend(depend) { + size_t ndeps = (kmp_intptr_t)depend[0]; + size_t num_doable; + // GOMP taskdep structure: + // if depend[0] != 0: + // depend = [ ndeps | nout | &out | ... | &out | &in | ... | &in ] + // + // if depend[0] == 0: + // depend = [ 0 | ndeps | nout | nmtx | nin | &out | ... | &out | &mtx | + // ... | &mtx | &in | ... | &in | &depobj | ... | &depobj ] + if (ndeps) { + num_out = (kmp_intptr_t)depend[1]; + num_in = ndeps - num_out; + num_mutexinout = 0; + num_doable = ndeps; + offset = 2; + } else { + ndeps = (kmp_intptr_t)depend[1]; + num_out = (kmp_intptr_t)depend[2]; + num_mutexinout = (kmp_intptr_t)depend[3]; + num_in = (kmp_intptr_t)depend[4]; + num_doable = num_out + num_mutexinout + num_in; + offset = 5; + } + // TODO: Support gomp depobj + if (ndeps != num_doable) { + KMP_FATAL(GompFeatureNotSupported, "depobj"); + } + num_deps = static_cast(ndeps); + } + kmp_int32 get_num_deps() const { return num_deps; } + kmp_depend_info_t get_kmp_depend(size_t index) const { + kmp_depend_info_t retval; + memset(&retval, '\0', sizeof(retval)); + KMP_ASSERT(index < (size_t)num_deps); + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + retval.len = 0; + // Because inout and out are logically equivalent, + // use inout and in dependency flags. GOMP does not provide a + // way to distinguish if user specified out vs. inout. 
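As an aside on the two GOMP layouts documented in the constructor above: for a task compiled by GCC with depend(out: a, b) depend(in: c), the short layout (depend[0] != 0) is used, and the conversion in this class marks the two leading addresses in|out and the trailing one in. A minimal, self-contained illustration follows; the variable names are hypothetical and this snippet is not part of the patch.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical GOMP "depend" vector for: depend(out: a, b) depend(in: c)
      // Short layout (depend[0] != 0): [ ndeps | nout | &a | &b | &c ]
      static int a, b, c;
      void *depend[5] = {(void *)(std::intptr_t)3, // ndeps: 3 dependences total
                         (void *)(std::intptr_t)2, // nout: first 2 addresses are out
                         &a, &b, // converted with flags.in = flags.out = 1
                         &c};    // converted with flags.in = 1
      std::printf("ndeps=%d nout=%d\n", (int)(std::intptr_t)depend[0],
                  (int)(std::intptr_t)depend[1]);
      return 0;
    }
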
+ if (index < num_out) { + retval.flags.in = 1; + retval.flags.out = 1; + } else if (index >= num_out && index < (num_out + num_mutexinout)) { + retval.flags.mtx = 1; + } else { + retval.flags.in = 1; + } + return retval; + } +}; + #ifdef __cplusplus extern "C" { #endif // __cplusplus @@ -35,8 +102,8 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) { if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmpc_barrier(&loc, gtid); #if OMPT_SUPPORT && OMPT_OPTIONAL @@ -183,20 +250,20 @@ void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) { if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); // Retrieve the value of the copyprivate data point, and wait for all // threads to do likewise, then return. retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data; + { #if OMPT_SUPPORT && OMPT_OPTIONAL - if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); - } #endif - __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + } #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; @@ -219,16 +286,16 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_END)(void *data) { if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + { #if OMPT_SUPPORT && OMPT_OPTIONAL - if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); - } #endif - __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + } #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; @@ -275,7 +342,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) { #define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u #define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u -// The parallel contruct +// The parallel construct #ifndef KMP_DEBUG static @@ -325,7 +392,7 @@ static enum sched_type schedule, long start, long end, long incr, long chunk_size) { - // Intialize the loop worksharing construct. + // Initialize the loop worksharing construct. KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size, schedule != kmp_sch_static); @@ -361,12 +428,9 @@ static #endif } -#ifndef KMP_DEBUG -static -#endif /* KMP_DEBUG */ - void - __kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *), - microtask_t wrapper, int argc, ...) { +static void __kmp_GOMP_fork_call(ident_t *loc, int gtid, unsigned num_threads, + unsigned flags, void (*unwrapped_task)(void *), + microtask_t wrapper, int argc, ...) 
{ int rc; kmp_info_t *thr = __kmp_threads[gtid]; kmp_team_t *team = thr->th.th_team; @@ -375,14 +439,12 @@ static va_list ap; va_start(ap, argc); + if (num_threads != 0) + __kmp_push_num_threads(loc, gtid, num_threads); + if (flags != 0) + __kmp_push_proc_bind(loc, gtid, (kmp_proc_bind_t)flags); rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, - __kmp_invoke_task_func, -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - &ap -#else - ap -#endif - ); + __kmp_invoke_task_func, kmp_va_addr_of(ap)); va_end(ap); @@ -409,14 +471,6 @@ static #endif } -static void __kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid, - void (*task)(void *)) { -#if OMPT_SUPPORT - OMPT_STORE_RETURN_ADDRESS(gtid); -#endif - __kmp_serialized_parallel(loc, gtid); -} - void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsigned num_threads) { @@ -428,24 +482,15 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif MKLOC(loc, "GOMP_parallel_start"); KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid)); - - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { - if (num_threads != 0) { - __kmp_push_num_threads(&loc, gtid, num_threads); - } - __kmp_GOMP_fork_call(&loc, gtid, task, - (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, - data); - } else { - __kmp_GOMP_serialized_parallel(&loc, gtid, task); - } - + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, + (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, + data); #if OMPT_SUPPORT if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL); @@ -466,25 +511,22 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { if (!thr->th.th_team->t.t_serialized) { __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr, thr->th.th_team); - + } #if OMPT_SUPPORT - if (ompt_enabled.enabled) { - // Implicit task is finished here, in the barrier we might schedule - // deferred tasks, - // these don't see the implicit task on the stack - OMPT_CUR_TASK_INFO(thr)->frame.exit_frame = ompt_data_none; - } + if (ompt_enabled.enabled) { + // Implicit task is finished here, in the barrier we might schedule + // deferred tasks, + // these don't see the implicit task on the stack + OMPT_CUR_TASK_INFO(thr)->frame.exit_frame = ompt_data_none; + } #endif - __kmp_join_call(&loc, gtid + __kmp_join_call(&loc, gtid #if OMPT_SUPPORT - , - fork_context_gnu + , + fork_context_gnu #endif - ); - } else { - __kmpc_end_serialized_parallel(&loc, gtid); - } + ); } // Loop worksharing constructs @@ -495,8 +537,8 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { // argument to __kmp_GOMP_fork_call). // // Conversely, KMP_DISPATCH_NEXT returns and inclusive upper bound in *p_ub, -// but the Gnu codegen expects an excluside upper bound, so the adjustment -// "*p_ub += stride" compenstates for the discrepancy. +// but the Gnu codegen expects an exclusive upper bound, so the adjustment +// "*p_ub += stride" compensates for the discrepancy. // // Correction: the gnu codegen always adjusts the upper bound by +-1, not the // stride value. 
We adjust the dispatch parameters accordingly (by +-1), but @@ -531,13 +573,17 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ - KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ - (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ - (schedule) != kmp_sch_static); \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ - status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ - (kmp_int *)p_ub, (kmp_int *)&stride); \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + } \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + } \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 1 : -1; \ @@ -567,12 +613,17 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ - KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ - (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ - status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ - (kmp_int *)p_ub, (kmp_int *)&stride); \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + TRUE); \ + } \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + } \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ @@ -635,6 +686,15 @@ LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT), {}) LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), kmp_ord_static) @@ -911,6 +971,18 @@ LOOP_NEXT_ULL( LOOP_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT), + {}) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), kmp_ord_static) @@ -1058,19 +1130,11 @@ LOOP_DOACROSS_RUNTIME_START_ULL( \ ompt_pre(); \ \ - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { \ - if (num_threads != 0) { \ - __kmp_push_num_threads(&loc, gtid, num_threads); \ - } \ - __kmp_GOMP_fork_call(&loc, gtid, task, \ - (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ - 9, task, data, num_threads, &loc, (schedule), lb, \ - (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ - } else { \ - __kmp_GOMP_serialized_parallel(&loc, gtid, task); \ - IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ - } \ + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, \ + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ + 9, task, data, num_threads, &loc, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ @@ -1132,11 +1196,11 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, KA_TRACE(20, ("GOMP_task: T#%d\n", gtid)); // The low-order bit is the "untied" flag - if (!(gomp_flags & 1)) { + if (!(gomp_flags & KMP_GOMP_TASK_UNTIED_FLAG)) { input_flags->tiedness = 1; } // The second low-order bit is the "final" flag - if (gomp_flags & 2) { + if (gomp_flags & KMP_GOMP_TASK_FINAL_FLAG) { input_flags->final = 1; } input_flags->native = 1; @@ -1167,26 +1231,23 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, #if OMPT_SUPPORT kmp_taskdata_t *current_task; if (ompt_enabled.enabled) { - OMPT_STORE_RETURN_ADDRESS(gtid); current_task = __kmp_threads[gtid]->th.th_current_task; current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif if (if_cond) { - if (gomp_flags & 8) { + if (gomp_flags & KMP_GOMP_TASK_DEPENDS_FLAG) { KMP_ASSERT(depend); - const size_t ndeps = (kmp_intptr_t)depend[0]; - const size_t nout = (kmp_intptr_t)depend[1]; + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); kmp_depend_info_t dep_list[ndeps]; - - for (size_t i = 0U; i < ndeps; i++) { - dep_list[i].base_addr = (kmp_intptr_t)depend[2U + i]; - dep_list[i].len = 0U; - dep_list[i].flags.in = 1; - dep_list[i].flags.out = (i < nout); - } - __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps, dep_list, 0, NULL); + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); + kmp_int32 ndeps_cnv; + __kmp_type_convert(ndeps, &ndeps_cnv); + __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps_cnv, dep_list, 0, NULL); } else { __kmpc_omp_task(&loc, gtid, task); } @@ -1203,9 +1264,18 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, thread->th.ompt_thread_info.wait_id = 0; thread->th.ompt_thread_info.state = ompt_state_work_parallel; taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif + if (gomp_flags & KMP_GOMP_TASK_DEPENDS_FLAG) { + KMP_ASSERT(depend); + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); + kmp_depend_info_t dep_list[ndeps]; + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); + __kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL); + } __kmpc_omp_task_begin_if0(&loc, gtid, task); func(data); @@ -1232,8 +1302,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT)(void) { int gtid = __kmp_entry_gtid(); #if OMPT_SUPPORT - if (ompt_enabled.enabled) - OMPT_STORE_RETURN_ADDRESS(gtid); + OMPT_STORE_RETURN_ADDRESS(gtid); #endif KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid)); @@ -1310,24 +1379,17 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)( if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif MKLOC(loc, "GOMP_parallel_sections_start"); KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid)); - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { - if (num_threads != 0) { - __kmp_push_num_threads(&loc, gtid, num_threads); - } - __kmp_GOMP_fork_call(&loc, gtid, task, - (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, - task, data, num_threads, &loc, kmp_nm_dynamic_chunked, - 
(kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); - } else { - __kmp_GOMP_serialized_parallel(&loc, gtid, task); - } + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, + task, data, num_threads, &loc, kmp_nm_dynamic_chunked, + (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); #if OMPT_SUPPORT if (ompt_enabled.enabled) { @@ -1349,8 +1411,8 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END)(void) { if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); #if OMPT_SUPPORT @@ -1385,22 +1447,12 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), if (ompt_enabled.enabled) { parent_task_info = __ompt_get_task_info_object(0); parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); - OMPT_STORE_RETURN_ADDRESS(gtid); } + OMPT_STORE_RETURN_ADDRESS(gtid); #endif - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { - if (num_threads != 0) { - __kmp_push_num_threads(&loc, gtid, num_threads); - } - if (flags != 0) { - __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); - } - __kmp_GOMP_fork_call(&loc, gtid, task, - (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, - data); - } else { - __kmp_GOMP_serialized_parallel(&loc, gtid, task); - } + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, + (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, + data); #if OMPT_SUPPORT if (ompt_enabled.enabled) { task_info = __ompt_get_task_info_object(0); @@ -1408,12 +1460,12 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), } #endif task(data); + { #if OMPT_SUPPORT - if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); - } #endif - KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); + } #if OMPT_SUPPORT if (ompt_enabled.enabled) { task_info->frame.exit_frame = ompt_data_none; @@ -1435,27 +1487,18 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), OMPT_STORE_RETURN_ADDRESS(gtid); #endif - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { - if (num_threads != 0) { - __kmp_push_num_threads(&loc, gtid, num_threads); - } - if (flags != 0) { - __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); - } - __kmp_GOMP_fork_call(&loc, gtid, task, - (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, - task, data, num_threads, &loc, kmp_nm_dynamic_chunked, - (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); - } else { - __kmp_GOMP_serialized_parallel(&loc, gtid, task); - } + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, + task, data, num_threads, &loc, kmp_nm_dynamic_chunked, + (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); + { #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(gtid); #endif KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); - + } task(data); KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); @@ -1473,25 +1516,18 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), gtid, lb, ub, str, chunk_sz)); \ \ ompt_pre(); \ - if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { \ - if (num_threads != 0) { \ - __kmp_push_num_threads(&loc, gtid, num_threads); 
\ - } \ - if (flags != 0) { \ - __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); \ - } \ - __kmp_GOMP_fork_call(&loc, gtid, task, \ - (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ - 9, task, data, num_threads, &loc, (schedule), lb, \ - (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ - } else { \ - __kmp_GOMP_serialized_parallel(&loc, gtid, task); \ - } \ - \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ - KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ - (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ - (schedule) != kmp_sch_static); \ + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, \ + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ + 9, task, data, num_threads, &loc, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ + \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + } \ task(data); \ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); \ ompt_post(); \ @@ -1513,6 +1549,12 @@ PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { int gtid = __kmp_entry_gtid(); @@ -1520,8 +1562,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid)); #if OMPT_SUPPORT - if (ompt_enabled.enabled) - OMPT_STORE_RETURN_ADDRESS(gtid); + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmpc_taskgroup(&loc, gtid); @@ -1535,8 +1576,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_END)(void) { KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid)); #if OMPT_SUPPORT - if (ompt_enabled.enabled) - OMPT_STORE_RETURN_ADDRESS(gtid); + OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmpc_end_taskgroup(&loc, gtid); @@ -1743,7 +1783,7 @@ void __GOMP_taskloop(void (*func)(void *), void *data, // 4 byte version of GOMP_doacross_post // This verison needs to create a temporary array which converts 4 byte -// integers into 8 byte integeres +// integers into 8 byte integers template void __kmp_GOMP_doacross_post(T *count); @@ -1752,8 +1792,8 @@ template <> void __kmp_GOMP_doacross_post(long *count) { kmp_info_t *th = __kmp_threads[gtid]; MKLOC(loc, "GOMP_doacross_post"); kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; - kmp_int64 *vec = - (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims); + kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc( + th, (size_t)(sizeof(kmp_int64) * num_dims)); for (kmp_int64 i = 0; i < num_dims; ++i) { vec[i] = (kmp_int64)count[i]; } @@ -1775,8 +1815,8 @@ template void __kmp_GOMP_doacross_wait(T first, va_list args) { kmp_info_t *th = __kmp_threads[gtid]; MKLOC(loc, "GOMP_doacross_wait"); kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; - kmp_int64 *vec = - (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims); + kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc( + th, (size_t)(sizeof(kmp_int64) * num_dims)); vec[0] = (kmp_int64)first; for (kmp_int64 i 
= 1; i < num_dims; ++i) { T item = va_arg(args, T); @@ -1835,6 +1875,41 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)( va_end(args); } +// fn: the function each master thread of new team will call +// data: argument to fn +// num_teams, thread_limit: max bounds on respective ICV +// flags: unused +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TEAMS_REG)(void (*fn)(void *), + void *data, + unsigned num_teams, + unsigned thread_limit, + unsigned flags) { + MKLOC(loc, "GOMP_teams_reg"); + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_teams_reg: T#%d num_teams=%u thread_limit=%u flag=%u\n", + gtid, num_teams, thread_limit, flags)); + __kmpc_push_num_teams(&loc, gtid, num_teams, thread_limit); + __kmpc_fork_teams(&loc, 2, (microtask_t)__kmp_GOMP_microtask_wrapper, fn, + data); + KA_TRACE(20, ("GOMP_teams_reg exit: T#%d\n", gtid)); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT_DEPEND)(void **depend) { + MKLOC(loc, "GOMP_taskwait_depend"); + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_taskwait_depend: T#%d\n", gtid)); + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); + kmp_depend_info_t dep_list[ndeps]; + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL); + KA_TRACE(20, ("GOMP_taskwait_depend exit: T#%d\n", gtid)); +} + /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. This is only pertinent for ELF .so library. The KMP_VERSION_SYMBOL macro is defined in @@ -1985,6 +2060,30 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC, 45, KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED, 45, "GOMP_4.5"); +// GOMP_5.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS_REG, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT_DEPEND, 50, "GOMP_5.0"); + #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus diff --git a/runtime/src/kmp_i18n.cpp b/runtime/src/kmp_i18n.cpp index 53c442715..d2651cfab 100644 --- a/runtime/src/kmp_i18n.cpp +++ b/runtime/src/kmp_i18n.cpp @@ -639,7 +639,7 @@ kmp_msg_t __kmp_msg_format(unsigned id_arg, ...) { // numbers, for example: "%2$s %1$s". 
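The positional conversions mentioned in the comment above are what let a translated catalog message reorder its arguments without touching the call site; they are a POSIX/glibc printf feature rather than ISO C, which is exactly why the Windows branch below has to fall back to FormatMessage. A tiny standalone illustration, not runtime code:

    #include <cstdio>

    int main() {
      // "%2$s %1$s" consumes argument 2 first, then argument 1 (POSIX printf).
      std::printf("%2$s %1$s\n", "world", "hello"); // prints "hello world"
      return 0;
    }
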
__kmp_str_buf_vprint(&buffer, __kmp_i18n_catgets(id), args); #elif KMP_OS_WINDOWS - // On Winodws, printf() family functions does not recognize GNU style + // On Windows, printf() family functions does not recognize GNU style // parameter numbers, so we have to use FormatMessage() instead. It recognizes // parameter numbers, e. g.: "%2!s! "%1!s!". { diff --git a/runtime/src/kmp_i18n.h b/runtime/src/kmp_i18n.h index 9d79a21bb..c3f21d6a5 100644 --- a/runtime/src/kmp_i18n.h +++ b/runtime/src/kmp_i18n.h @@ -32,7 +32,7 @@ extern "C" { __kmp_i18n_catgets() returns read-only string. It should not be freed. - KMP_I18N_STR macro simplifies acces to strings in message catalog a bit. + KMP_I18N_STR macro simplifies access to strings in message catalog a bit. Following two lines are equivalent: __kmp_i18n_catgets( kmp_i18n_str_Warning ) @@ -103,7 +103,7 @@ struct kmp_msg { kmp_msg_type_t type; int num; char *str; - int len; + size_t len; }; // struct kmp_message typedef struct kmp_msg kmp_msg_t; diff --git a/runtime/src/kmp_itt.inl b/runtime/src/kmp_itt.inl index 6e37ce0f0..e7c6041d6 100644 --- a/runtime/src/kmp_itt.inl +++ b/runtime/src/kmp_itt.inl @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// // Inline function definitions. This file should be included into kmp_itt.h file -// for production build (to let compliler inline functions) or into kmp_itt.c +// for production build (to let compiler inline functions) or into kmp_itt.c // file for debug build (to reduce the number of files to recompile and save // build time). @@ -115,7 +115,8 @@ LINKAGE void __kmp_itt_region_forking(int gtid, int team_size, int barriers) { // that the tools more or less standardized on: // "$omp$parallel@[file:][:]" char *buff = NULL; - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size, str_loc.file, str_loc.line, str_loc.col); @@ -155,7 +156,8 @@ LINKAGE void __kmp_itt_region_forking(int gtid, int team_size, int barriers) { if ((frm < KMP_MAX_FRAME_DOMAINS) && (__kmp_itt_region_team_size[frm] != team_size)) { char *buff = NULL; - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size, str_loc.file, str_loc.line, str_loc.col); @@ -212,7 +214,8 @@ LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, // that the tools more or less standardized on: // "$omp$parallel:team_size@[file:][:]" char *buff = NULL; - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size, str_loc.file, str_loc.line, str_loc.col); @@ -230,10 +233,12 @@ LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, // Check if team size was changed. 
Then create new region domain for this // location unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1; - if ((frm < KMP_MAX_FRAME_DOMAINS) && - (__kmp_itt_region_team_size[frm] != team_size)) { + if (frm >= KMP_MAX_FRAME_DOMAINS) + return; // something's gone wrong, returning + if (__kmp_itt_region_team_size[frm] != team_size) { char *buff = NULL; - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size, str_loc.file, str_loc.line, str_loc.col); @@ -272,7 +277,8 @@ LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, // Transform compiler-generated region location into the format // that the tools more or less standardized on: // "$omp$frame@[file:][:]" - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); if (imbalance) { char *buff_imb = NULL; buff_imb = __kmp_str_format("%s$omp$barrier-imbalance:%d@%s:%d", @@ -364,25 +370,12 @@ LINKAGE void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type, } // Parse line and column from psource string: ";file;func;line;col;;" - char *s_line; - char *s_col; KMP_DEBUG_ASSERT(loc->psource); -#ifdef __cplusplus - s_line = strchr(CCAST(char *, loc->psource), ';'); -#else - s_line = strchr(loc->psource, ';'); -#endif - KMP_DEBUG_ASSERT(s_line); - s_line = strchr(s_line + 1, ';'); // 2-nd semicolon - KMP_DEBUG_ASSERT(s_line); - s_line = strchr(s_line + 1, ';'); // 3-rd semicolon - KMP_DEBUG_ASSERT(s_line); - s_col = strchr(s_line + 1, ';'); // 4-th semicolon - KMP_DEBUG_ASSERT(s_col); - kmp_uint64 loop_data[5]; - loop_data[0] = atoi(s_line + 1); // read line - loop_data[1] = atoi(s_col + 1); // read column + int line, col; + __kmp_str_loc_numbers(loc->psource, &line, &col); + loop_data[0] = line; + loop_data[1] = col; loop_data[2] = sched_type; loop_data[3] = iterations; loop_data[4] = chunk; @@ -408,12 +401,11 @@ LINKAGE void __kmp_itt_metadata_single(ident_t *loc) { __kmp_release_bootstrap_lock(&metadata_lock); } - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1); + int line, col; + __kmp_str_loc_numbers(loc->psource, &line, &col); kmp_uint64 single_data[2]; - single_data[0] = str_loc.line; - single_data[1] = str_loc.col; - - __kmp_str_loc_free(&str_loc); + single_data[0] = line; + single_data[1] = col; __itt_metadata_add(metadata_domain, __itt_null, string_handle_sngl, __itt_metadata_u64, 2, single_data); @@ -474,7 +466,7 @@ LINKAGE void __kmp_itt_region_joined(int gtid) { ITT need an address (void *) to be specified as a sync object. OpenMP RTL does not have barrier object or barrier data structure. Barrier is just a counter in team and thread structures. We could use an address of team - structure as an barrier sync object, but ITT wants different objects for + structure as a barrier sync object, but ITT wants different objects for different barriers (even whithin the same team). So let us use team address as barrier sync object for the first barrier, then increase it by one for the next barrier, and so on (but wrap it not to use addresses outside of team @@ -502,13 +494,14 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name, // Now form the barrier id. Encode barrier type (bt) in barrier id too, so // barriers of different types do not have the same ids. 
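To make the wrap-around arithmetic in the statement below concrete, here is a small sketch with made-up constants; the real values depend on sizeof(kmp_team_t) and bs_last_barrier for the particular build, so this is illustrative only.

    #include <cstdio>

    int main() {
      // Made-up numbers: 2 barrier types, team struct with room for 5 ids each.
      const unsigned long team = 0x1000, team_size = 10, bs_last = 2;
      for (unsigned long counter = 0; counter < 7; ++counter)
        for (unsigned long bt = 0; bt < bs_last; ++bt)
          std::printf("counter=%lu bt=%lu -> %#lx\n", counter, bt,
                      team + counter % (team_size / bs_last) * bs_last + bt);
      return 0; // addresses repeat once counter wraps past team_size / bs_last
    }
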
KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier); - // This conditon is a must (we would have zero divide otherwise). + // This condition is a must (we would have zero divide otherwise). KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier); // More strong condition: make sure we have room at least for for two - // differtent ids (for each barrier type). + // different ids (for each barrier type). object = reinterpret_cast( - kmp_uintptr_t(team) + - counter % (sizeof(kmp_team_t) / bs_last_barrier) * bs_last_barrier + + (kmp_uintptr_t)(team) + + (kmp_uintptr_t)counter % (sizeof(kmp_team_t) / bs_last_barrier) * + bs_last_barrier + bt); KMP_ITT_DEBUG_LOCK(); KMP_ITT_DEBUG_PRINT("[bar obj] type=%d, counter=%lld, object=%p\n", bt, @@ -629,7 +622,7 @@ void __kmp_itt_barrier_finished(int gtid, void *object) { void *__kmp_itt_taskwait_object(int gtid) { void *object = NULL; #if USE_ITT_NOTIFY - if (__itt_sync_create_ptr) { + if (UNLIKELY(__itt_sync_create_ptr)) { kmp_info_t *thread = __kmp_thread_from_gtid(gtid); kmp_taskdata_t *taskdata = thread->th.th_current_task; object = reinterpret_cast(kmp_uintptr_t(taskdata) + @@ -676,7 +669,7 @@ void __kmp_itt_task_starting( void *object // ITT sync object: barrier or taskwait. ) { #if USE_ITT_NOTIFY - if (object != NULL) { + if (UNLIKELY(object != NULL)) { KMP_ITT_DEBUG_LOCK(); __itt_sync_cancel(object); KMP_ITT_DEBUG_PRINT("[tsk sta] scan( %p )\n", object); diff --git a/runtime/src/kmp_lock.cpp b/runtime/src/kmp_lock.cpp index 2cc9e0827..f13630b36 100644 --- a/runtime/src/kmp_lock.cpp +++ b/runtime/src/kmp_lock.cpp @@ -39,6 +39,242 @@ #endif #endif +#if KMP_USE_ABT + +void __kmp_validate_locks(void) { + ;// Empty. +} + +// kmp_base_xxx_lock_t must be larger than 64 bytes to avoid unintentional +// inlining (See comments by grepping "__kmp_base_user_lock_size".) 
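The KMP_DEFINE_LOCKS macro introduced below generates one such implementation per lock flavor under KMP_USE_ABT. Expanded for a single flavor, its acquire/release path boils down to roughly the following sketch, with std::atomic standing in for KMP_XCHG_FIXED8 and KMP_MB; ABT_thread_yield is the real Argobots call, everything else here is schematic.

    #include <atomic>

    // Schematic acquire/release: spin on a byte flag, but yield the current
    // user-level thread (ABT_thread_yield in the generated code) instead of
    // burning the core while the flag stays set.
    struct toy_lock { std::atomic<char> lock{0}; };

    void toy_acquire(toy_lock *lck, void (*yield_ult)()) {
      while (lck->lock.exchange(1, std::memory_order_acquire)) { // KMP_XCHG_FIXED8
        while (lck->lock.load(std::memory_order_relaxed) == 1)
          yield_ult(); // ABT_thread_yield() when Argobots is initialized
      }
    }

    void toy_release(toy_lock *lck) {
      lck->lock.store(0, std::memory_order_release); // lck->lock = 0 between fences
    }
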
+#define KMP_LOCK_STATIC static inline __attribute__((unused)) +#define KMP_DEFINE_LOCKS(locktype) \ +typedef struct { char _[64]; } kmp_base_ ## locktype ## _lock_t; \ +KMP_LOCK_STATIC int __kmp_is_ ## locktype ## _lock_initialized \ + (kmp_ ## locktype ## _lock_t *lck) { \ + return lck == lck->initialized; \ +} \ +int __kmp_acquire_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + while (KMP_XCHG_FIXED8(&lck->lock, 1)) { \ + while (*((volatile char *)&lck->lock) == 1) { \ + if (ABT_initialized() == ABT_SUCCESS) \ + ABT_thread_yield(); \ + } \ + KMP_MB(); \ + } \ + return 0; \ +} \ +int __kmp_test_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + if (KMP_XCHG_FIXED8(&lck->lock, 1)) { \ + return FALSE; \ + } else { \ + return TRUE; \ + } \ +} \ +int __kmp_release_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + KMP_MB(); \ + lck->lock = 0; \ + KMP_MB(); \ + return 0; \ +} \ +void __kmp_init_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck) { \ + lck->initialized = lck; \ + lck->owner_gtid = -2; \ + lck->nest_level = 0; \ + lck->lock = 0; \ + lck->location = NULL; \ + lck->flags = 0; \ +} \ +void __kmp_destroy_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck) { \ + lck->initialized = NULL; \ +} \ +KMP_LOCK_STATIC int __kmp_acquire_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_acquire_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC int __kmp_test_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_test_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC int __kmp_release_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_release_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC void __kmp_init_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck) { \ + __kmp_init_ ## locktype ## _lock(lck); \ +} \ +KMP_LOCK_STATIC void __kmp_destroy_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck) { \ + __kmp_destroy_ ## locktype ## _lock(lck); \ +} \ +int __kmp_acquire_nested_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + if (lck->owner_gtid == gtid) { \ + lck->nest_level++; \ + return 0; \ + } \ + while (KMP_XCHG_FIXED8(&lck->lock, 1)) { \ + while (*((volatile char *)&lck->lock) == 1) { \ + if (ABT_initialized() == ABT_SUCCESS) \ + ABT_thread_yield(); \ + } \ + KMP_MB(); \ + } \ + KMP_DEBUG_ASSERT(lck->owner_gtid != -2); \ + KMP_DEBUG_ASSERT(lck->nest_level == 0); \ + lck->owner_gtid = gtid; \ + return 0; \ +} \ +int __kmp_test_nested_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + if (lck->owner_gtid == gtid) { \ + lck->nest_level++; \ + return 0; \ + } \ + if (KMP_XCHG_FIXED8(&lck->lock, 1)) { \ + return FALSE; \ + } else { \ + KMP_DEBUG_ASSERT(lck->owner_gtid != -2); \ + KMP_DEBUG_ASSERT(lck->nest_level == 0); \ + lck->owner_gtid = gtid; \ + return TRUE; \ + } \ +} \ +int __kmp_release_nested_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + if (lck->nest_level == 0) { \ + lck->owner_gtid = -1; \ + KMP_MB(); \ + lck->lock = 0; \ + KMP_MB(); \ + } else { \ + lck->nest_level--; \ + } \ + return 0; \ +} \ +void __kmp_init_nested_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck) { \ + lck->initialized = lck; \ + lck->owner_gtid = -1; \ + 
lck->nest_level = 0; \ + lck->lock = 0; \ + lck->location = NULL; \ + lck->flags = 0; \ +} \ +void __kmp_destroy_nested_ ## locktype ## _lock \ + (kmp_ ## locktype ## _lock_t *lck) { \ + KMP_MB(); \ + lck->lock = 0; \ + lck->owner_gtid = -1; \ + lck->nest_level = 0; \ + KMP_MB(); \ +} \ +KMP_LOCK_STATIC int __kmp_acquire_nested_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_acquire_nested_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC int __kmp_test_nested_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_test_nested_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC int __kmp_release_nested_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck, kmp_int32 gtid) { \ + return __kmp_release_nested_ ## locktype ## _lock(lck, gtid); \ +} \ +KMP_LOCK_STATIC void __kmp_init_nested_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck) { \ + __kmp_init_nested_ ## locktype ## _lock(lck); \ +} \ +KMP_LOCK_STATIC void __kmp_destroy_nested_ ## locktype ## _lock_with_checks \ + (kmp_ ## locktype ## _lock_t *lck) { \ + __kmp_destroy_nested_ ## locktype ## _lock(lck); \ +} \ +KMP_LOCK_STATIC const ident_t *__kmp_get_ ## locktype ## _lock_location \ + (kmp_ ## locktype ## _lock_t *lck) { \ + return lck->location; \ +} \ +KMP_LOCK_STATIC void __kmp_set_ ## locktype ## _lock_location \ + (kmp_ ## locktype ## _lock_t *lck, const ident_t *loc) { \ + lck->location = loc; \ +} \ +KMP_LOCK_STATIC kmp_lock_flags_t __kmp_get_ ## locktype ## _lock_flags \ + (kmp_ ## locktype ## _lock_t *lck) { \ + return lck->flags; \ +} \ +KMP_LOCK_STATIC void __kmp_set_ ## locktype ## _lock_flags \ + (kmp_ ## locktype ## _lock_t *lck, kmp_lock_flags_t flags) { \ + lck->flags = flags; \ +} \ +KMP_LOCK_STATIC kmp_int32 __kmp_get_ ## locktype ## _lock_owner \ + (kmp_ ## locktype ## _lock_t *lck) { \ + return 0; \ +} + +KMP_DEFINE_LOCKS(tas) +KMP_DEFINE_LOCKS(ticket) +KMP_DEFINE_LOCKS(queuing) +KMP_DEFINE_LOCKS(drdpa) +#if KMP_USE_FUTEX +KMP_DEFINE_LOCKS(futex) +#endif +#if KMP_USE_ADAPTIVE_LOCKS +KMP_DEFINE_LOCKS(adaptive) +#endif +typedef kmp_abt_mutex_lock_t kmp_hle_lock_t; +KMP_DEFINE_LOCKS(hle) +typedef kmp_abt_mutex_lock_t kmp_rtm_spin_lock_t; +KMP_DEFINE_LOCKS(rtm_spin) +typedef kmp_abt_mutex_lock_t kmp_rtm_queuing_lock_t; +KMP_DEFINE_LOCKS(rtm_queuing) + +#undef KMP_LOCK_STATIC + +int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + return __kmp_abt_acquire_spin_lock(lck); +} +int __kmp_test_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + return __kmp_abt_test_spin_lock(lck); +} +void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_abt_release_spin_lock(lck); +} +void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_abt_init_spin_lock(lck); +} +void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_abt_destroy_spin_lock(lck); +} + +int __kmp_acquire_rbootstrap_lock(kmp_rbootstrap_lock_t *lck) { + return __kmp_abt_acquire_rspin_lock_impl(lck, 0); +} +int __kmp_acquire_rbootstrap_lock_if_es(kmp_rbootstrap_lock_t *lck) { + return __kmp_abt_acquire_rspin_lock_impl(lck, 1); +} +int __kmp_test_rbootstrap_lock(kmp_rbootstrap_lock_t *lck) { + return __kmp_abt_test_rspin_lock(lck); +} +void __kmp_release_rbootstrap_lock(kmp_rbootstrap_lock_t *lck) { + __kmp_abt_release_rspin_lock_impl(lck, 0); +} +void __kmp_release_rbootstrap_lock_if_es(kmp_rbootstrap_lock_t *lck) { + __kmp_abt_release_rspin_lock_impl(lck, 
1); +} +void __kmp_init_rbootstrap_lock(kmp_rbootstrap_lock_t *lck) { + __kmp_abt_init_rspin_lock(lck); +} +void __kmp_destroy_rbootstrap_lock(kmp_rbootstrap_lock_t *lck) { + __kmp_abt_destroy_rspin_lock(lck); +} + +#else // KMP_USE_ABT + /* Implement spin locks for internal library use. */ /* The algorithm implemented is Lamport's bakery lock [1974]. */ @@ -372,11 +608,11 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) { ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n", lck, gtid, poll_val)); - kmp_int32 rc; + long rc; if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " - "failed (rc=%d errno=%d)\n", + "failed (rc=%ld errno=%d)\n", lck, gtid, poll_val, rc, errno)); continue; } @@ -1239,6 +1475,9 @@ __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck, KMP_MB(); // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck); + // Synchronize writes to both runtime thread structures + // and writes in user code. + KMP_MB(); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "acq spin"); @@ -1701,10 +1940,7 @@ static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck, /* RTM Adaptive locks */ -#if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) || \ - (KMP_COMPILER_MSVC && _MSC_VER >= 1700) || \ - (KMP_COMPILER_CLANG && KMP_MSVC_COMPAT) - +#if KMP_HAVE_RTM_INTRINSICS #include #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) @@ -1891,20 +2127,6 @@ static float percent(kmp_uint32 count, kmp_uint32 total) { return (total == 0) ? 0.0 : (100.0 * count) / total; } -static FILE *__kmp_open_stats_file() { - if (strcmp(__kmp_speculative_statsfile, "-") == 0) - return stdout; - - size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20; - char buffer[buffLen]; - KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile, - (kmp_int32)getpid()); - FILE *result = fopen(&buffer[0], "w"); - - // Maybe we should issue a warning here... - return result ? result : stdout; -} - void __kmp_print_speculative_stats() { kmp_adaptive_lock_statistics_t total = destroyedStats; kmp_adaptive_lock_info_t *lck; @@ -1921,7 +2143,16 @@ void __kmp_print_speculative_stats() { if (totalSections <= 0) return; - FILE *statsFile = __kmp_open_stats_file(); + kmp_safe_raii_file_t statsFile; + if (strcmp(__kmp_speculative_statsfile, "-") == 0) { + statsFile.set_stdout(); + } else { + size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20; + char buffer[buffLen]; + KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile, + (kmp_int32)getpid()); + statsFile.open(buffer, "w"); + } fprintf(statsFile, "Speculative lock statistics (all approximate!)\n"); fprintf(statsFile, " Lock parameters: \n" @@ -1953,9 +2184,6 @@ void __kmp_print_speculative_stats() { fprintf(statsFile, " Hard failures : %10d (%5.1f%%)\n", t->hardFailedSpeculations, percent(t->hardFailedSpeculations, totalSpeculations)); - - if (statsFile != stdout) - fclose(statsFile); } #define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++) @@ -1999,6 +2227,7 @@ static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) { } // Check whether speculation should be attempted. 
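On the statistics-file handling replaced earlier in this hunk: moving from __kmp_open_stats_file()/fclose() to kmp_safe_raii_file_t ties the close to scope exit, so early returns cannot leak the handle and stdout is never closed by mistake. A minimal sketch of that kind of wrapper, hypothetical and not the runtime's actual kmp_safe_raii_file_t:

    #include <cstdio>

    // Minimal RAII sketch: owns a FILE* unless it refers to stdout and closes
    // it automatically when the object leaves scope.
    class safe_file {
      FILE *f = nullptr;
      bool owned = false;

    public:
      void set_stdout() { f = stdout; }
      void open(const char *name, const char *mode) {
        f = std::fopen(name, mode);
        owned = (f != nullptr);
        if (!f)
          f = stdout; // fall back to stdout, mirroring the old behavior
      }
      operator FILE *() const { return f; }
      ~safe_file() {
        if (owned)
          std::fclose(f);
      }
    };
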
+KMP_ATTRIBUTE_TARGET_RTM static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { kmp_uint32 badness = lck->lk.adaptive.badness; @@ -2009,6 +2238,7 @@ static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck, // Attempt to acquire only the speculative lock. // Does not back off to the non-speculative lock. +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { int retries = lck->lk.adaptive.max_soft_retries; @@ -2150,6 +2380,7 @@ static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck, lck->lk.qlk.owner_id = gtid + 1; } +KMP_ATTRIBUTE_TARGET_RTM static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR( @@ -2655,6 +2886,8 @@ static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck, lck->lk.flags = flags; } +#endif // !KMP_USE_ABT + // Time stamp counter #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #define __kmp_tsc() __kmp_hardware_timestamp() @@ -2703,7 +2936,7 @@ static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck, ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq)); } -#if KMP_USE_TSX +#if KMP_USE_TSX && !KMP_USE_ABT // HLE lock functions - imported from the testbed runtime. #define HLE_ACQUIRE ".byte 0xf2;" @@ -2761,19 +2994,22 @@ static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck, return __kmp_test_hle_lock(lck, gtid); // TODO: add checks } -static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) { +static void __kmp_init_rtm_queuing_lock(kmp_queuing_lock_t *lck) { __kmp_init_queuing_lock(lck); } -static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) { +static void __kmp_destroy_rtm_queuing_lock(kmp_queuing_lock_t *lck) { __kmp_destroy_queuing_lock(lck); } -static void __kmp_destroy_rtm_lock_with_checks(kmp_queuing_lock_t *lck) { +static void +__kmp_destroy_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { __kmp_destroy_queuing_lock_with_checks(lck); } -static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { +KMP_ATTRIBUTE_TARGET_RTM +static void __kmp_acquire_rtm_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { unsigned retries = 3, status; do { status = _xbegin(); @@ -2795,12 +3031,14 @@ static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { __kmp_acquire_queuing_lock(lck, gtid); } -static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck, - kmp_int32 gtid) { - __kmp_acquire_rtm_lock(lck, gtid); +static void __kmp_acquire_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + __kmp_acquire_rtm_queuing_lock(lck, gtid); } -static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_release_rtm_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { if (__kmp_is_unlocked_queuing_lock(lck)) { // Releasing from speculation _xend(); @@ -2811,12 +3049,14 @@ static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { return KMP_LOCK_RELEASED; } -static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck, - kmp_int32 gtid) { - return __kmp_release_rtm_lock(lck, gtid); +static int __kmp_release_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + return __kmp_release_rtm_queuing_lock(lck, gtid); } -static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_test_rtm_queuing_lock(kmp_queuing_lock_t *lck, 
+ kmp_int32 gtid) { unsigned retries = 3, status; do { status = _xbegin(); @@ -2827,15 +3067,111 @@ static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { break; } while (retries--); - return (__kmp_is_unlocked_queuing_lock(lck)) ? 1 : 0; + return __kmp_test_queuing_lock(lck, gtid); } -static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck, - kmp_int32 gtid) { - return __kmp_test_rtm_lock(lck, gtid); +static int __kmp_test_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_rtm_queuing_lock(lck, gtid); +} + +// Reuse kmp_tas_lock_t for TSX lock which use RTM with fall-back spin lock. +typedef kmp_tas_lock_t kmp_rtm_spin_lock_t; + +static void __kmp_destroy_rtm_spin_lock(kmp_rtm_spin_lock_t *lck) { + KMP_ATOMIC_ST_REL(&lck->lk.poll, 0); +} + +static void __kmp_destroy_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck) { + __kmp_destroy_rtm_spin_lock(lck); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_acquire_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + unsigned retries = 3, status; + kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin); + kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin); + do { + status = _xbegin(); + if (status == _XBEGIN_STARTED) { + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) + return KMP_LOCK_ACQUIRED_FIRST; + _xabort(0xff); + } + if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) { + // Wait until lock becomes free + while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free) { + KMP_YIELD(TRUE); + } + } else if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + // Fall-back spin lock + KMP_FSYNC_PREPARE(lck); + kmp_backoff_t backoff = __kmp_spin_backoff_params; + while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free || + !__kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) { + __kmp_spin_backoff(&backoff); + } + KMP_FSYNC_ACQUIRED(lck); + return KMP_LOCK_ACQUIRED_FIRST; +} + +static int __kmp_acquire_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_acquire_rtm_spin_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_release_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == KMP_LOCK_FREE(rtm_spin)) { + // Releasing from speculation + _xend(); + } else { + // Releasing from a real lock + KMP_FSYNC_RELEASING(lck); + KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(rtm_spin)); + } + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_release_rtm_spin_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_test_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, kmp_int32 gtid) { + unsigned retries = 3, status; + kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin); + kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin); + do { + status = _xbegin(); + if (status == _XBEGIN_STARTED && + KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) { + return TRUE; + } + if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free && + __kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) { + KMP_FSYNC_ACQUIRED(lck); + return TRUE; + } + return FALSE; +} + +static int __kmp_test_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_rtm_spin_lock(lck, gtid); } -#endif // KMP_USE_TSX +#endif // KMP_USE_TSX && !KMP_USE_ABT // Entry functions for indirect locks (first element of 
direct lock jump tables) static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l, @@ -3018,7 +3354,7 @@ kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0}; // User lock allocator for dynamically dispatched indirect locks. Every entry of -// the indirect lock table holds the address and type of the allocated indrect +// the indirect lock table holds the address and type of the allocated indirect // lock (kmp_indirect_lock_t), and the size of the table doubles when it is // full. A destroyed indirect lock object is returned to the reusable pool of // locks, unique to each lock type. @@ -3118,7 +3454,7 @@ static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock, } #endif #if KMP_USE_TSX - if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) { + if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.rtm) { seq = lockseq_queuing; } #endif @@ -3260,7 +3596,7 @@ void __kmp_init_dynamic_user_locks() { #endif __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t); #if KMP_USE_TSX - __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t); + __kmp_indirect_lock_size[locktag_rtm_queuing] = sizeof(kmp_queuing_lock_t); #endif __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t); #if KMP_USE_FUTEX @@ -3883,7 +4219,7 @@ void __kmp_cleanup_user_locks(void) { if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) && ((loc = __kmp_get_user_lock_location(lck)) != NULL) && (loc->psource != NULL)) { - kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0); + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false); KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line); __kmp_str_loc_free(&str_loc); } diff --git a/runtime/src/kmp_lock.h b/runtime/src/kmp_lock.h index 9ad86a516..8d5c044a0 100644 --- a/runtime/src/kmp_lock.h +++ b/runtime/src/kmp_lock.h @@ -19,6 +19,10 @@ #include "kmp_debug.h" #include "kmp_os.h" +#if KMP_USE_ABT +#include +#endif + #ifdef __cplusplus #include @@ -42,7 +46,7 @@ typedef struct ident ident_t; // ---------------------------------------------------------------------------- // We need to know the size of the area we can assume that the compiler(s) -// allocated for obects of type omp_lock_t and omp_nest_lock_t. The Intel +// allocated for objects of type omp_lock_t and omp_nest_lock_t. The Intel // compiler always allocates a pointer-sized area, as does visual studio. // // gcc however, only allocates 4 bytes for regular locks, even on 64-bit @@ -83,6 +87,157 @@ typedef struct kmp_lock_pool kmp_lock_pool_t; extern void __kmp_validate_locks(void); +#if KMP_USE_ABT + +struct KMP_ALIGN_CACHE kmp_abt_mutex_lock { + // `initialized' must be the first entry in the lock data structure! 
+ volatile kmp_abt_mutex_lock *initialized; + kmp_int32 owner_gtid, nest_level; + char lock; + ident_t const *location; + kmp_lock_flags_t flags; +}; + +#define KMP_ABT_MUTEX_LOCK_INITIALIZER(lock) {NULL, 0, 0, 0, NULL, 0} + +typedef struct kmp_abt_mutex_lock kmp_abt_mutex_lock_t; + +struct KMP_ALIGN_CACHE kmp_abt_spin_lock { + char lock; +}; +typedef struct kmp_abt_spin_lock kmp_abt_spin_lock_t; + +static inline int __kmp_abt_acquire_spin_lock(kmp_abt_spin_lock_t *lck) { + while (KMP_XCHG_FIXED8(&lck->lock, 1)) { + while (*((volatile char *)&lck->lock) == 1) { + if (ABT_initialized() == ABT_SUCCESS) + ABT_thread_yield(); + } + KMP_MB(); + } + return 0; +} + +static inline int __kmp_abt_test_spin_lock(kmp_abt_spin_lock_t *lck) { + if (KMP_XCHG_FIXED8(&lck->lock, 1)) { + return 1; + } else { + // Successfully take the lock. + return 0; + } +} + +static inline void __kmp_abt_release_spin_lock(kmp_abt_spin_lock_t *lck) { + KMP_MB(); + lck->lock = 0; + KMP_MB(); +} + +static inline void __kmp_abt_init_spin_lock(kmp_abt_spin_lock_t *lck) { + lck->lock = 0; +} + +static inline void __kmp_abt_destroy_spin_lock(kmp_abt_spin_lock_t *lck) { + ; // Do nothing. +} + +#define KMP_ABT_SPIN_LOCK_INITIALIZER(lock) {0} + +struct KMP_ALIGN_CACHE kmp_abt_rspin_lock { + char lock; + int owner_rank; + int depth; +}; +typedef struct kmp_abt_rspin_lock kmp_abt_rspin_lock_t; + +static inline int __kmp_abt_acquire_rspin_lock_impl(kmp_abt_rspin_lock_t *lck, + int if_es_flag) { +BEGIN_SPIN_LOCK: + int rank; + { + if (ABT_initialized() != ABT_SUCCESS) { + rank = -1; + } else if (ABT_xstream_self_rank(&rank) != ABT_SUCCESS) { + rank = -1; + } + } + if (if_es_flag && rank == -1) + return 0; + if (rank >= 0 && lck->owner_rank == rank) { + lck->depth++; + } else { + if (KMP_XCHG_FIXED8(&lck->lock, 1)) { + if (ABT_initialized() == ABT_SUCCESS) + ABT_thread_yield(); + goto BEGIN_SPIN_LOCK; + } + KMP_MB(); + // Successfully take the lock. + KMP_DEBUG_ASSERT(lck->depth == 0); + KMP_DEBUG_ASSERT(lck->owner_rank == -1); + lck->owner_rank = rank; + } + return 0; +} + +static inline int __kmp_abt_test_rspin_lock(kmp_abt_rspin_lock_t *lck) { + int rank; + { + if (ABT_initialized() != ABT_SUCCESS) { + rank = -1; + } else if (ABT_xstream_self_rank(&rank) != ABT_SUCCESS) { + rank = -1; + } + } + if (rank >= 0 && lck->owner_rank == rank) { + // Successfully take the lock. + lck->depth++; + return 0; + } else { + if (KMP_XCHG_FIXED8(&lck->lock, 1)) { + return 1; + } else { + // Successfully take the lock. + KMP_DEBUG_ASSERT(lck->depth == 0); + KMP_DEBUG_ASSERT(lck->owner_rank == -1); + lck->owner_rank = rank; + return 0; + } + } +} + +static inline void __kmp_abt_release_rspin_lock_impl(kmp_abt_rspin_lock_t *lck, + int if_es_flag) { + if (if_es_flag) { + int rank; + if (ABT_initialized() != ABT_SUCCESS + || ABT_xstream_self_rank(&rank) != ABT_SUCCESS) + return; + } + if (lck->depth == 0) { + lck->owner_rank = -1; + KMP_MB(); + lck->lock = 0; + KMP_MB(); + } else { + lck->depth--; + } +} + +static inline void __kmp_abt_init_rspin_lock(kmp_abt_rspin_lock_t *lck) { + lck->lock = 0; + lck->owner_rank = -1; + lck->depth = 0; +} + +static inline void __kmp_abt_destroy_rspin_lock(kmp_abt_rspin_lock_t *lck) { + ; // Do nothing. +} + +#define KMP_ABT_RSPIN_LOCK_INITIALIZER(lock) {0, -1, 0} + +#endif // KMP_USE_ABT + // ---------------------------------------------------------------------------- // There are 5 lock implementations: // 1. Test and set locks. @@ -118,6 +273,14 @@ extern void __kmp_validate_locks(void); // set locks. 
// ---------------------------------------------------------------------------- +#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_tas_lock_t; + +#define KMP_TAS_LOCK_INITIALIZER KMP_ABT_MUTEX_LOCK_INITIALIZER + +#else // KMP_USE_ABT + struct kmp_base_tas_lock { // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread std::atomic poll; @@ -141,6 +304,8 @@ typedef union kmp_tas_lock kmp_tas_lock_t; { ATOMIC_VAR_INIT(KMP_LOCK_FREE(tas)), 0 } \ } +#endif // !KMP_USE_ABT + extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); @@ -159,7 +324,7 @@ extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); #define KMP_LOCK_ACQUIRED_NEXT 0 #ifndef KMP_USE_FUTEX #define KMP_USE_FUTEX \ - (KMP_OS_LINUX && !KMP_OS_CNK && \ + (KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)) #endif #if KMP_USE_FUTEX @@ -175,6 +340,14 @@ extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); // set locks. With non-nested futex locks, the lock owner is not even available. // ---------------------------------------------------------------------------- +#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_futex_lock_t; + +#define KMP_FUTEX_LOCK_INITIALIZER KMP_ABT_MUTEX_LOCK_INITIALIZER + +#else // KMP_USE_ABT + struct kmp_base_futex_lock { volatile kmp_int32 poll; // KMP_LOCK_FREE(futex) => unlocked // 2*(gtid+1) of owning thread, 0 if unlocked @@ -200,6 +373,8 @@ typedef union kmp_futex_lock kmp_futex_lock_t; { KMP_LOCK_FREE(futex), 0 } \ } +#endif // !KMP_USE_ABT + extern int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); @@ -219,6 +394,15 @@ extern void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck); // ---------------------------------------------------------------------------- // Ticket locks. + +#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_ticket_lock_t; + +#define KMP_TICKET_LOCK_INITIALIZER KMP_ABT_MUTEX_LOCK_INITIALIZER + +#else // KMP_USE_ABT + #ifdef __cplusplus #ifdef _MSC_VER @@ -283,6 +467,8 @@ typedef union kmp_ticket_lock kmp_ticket_lock_t; } \ } +#endif // !KMP_USE_ABT + extern int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_ticket_lock_with_cheks(kmp_ticket_lock_t *lck, @@ -303,6 +489,12 @@ extern void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck); // ---------------------------------------------------------------------------- // Queuing locks. +#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_queuing_lock_t; + +#else // KMP_USE_ABT + #if KMP_USE_ADAPTIVE_LOCKS struct kmp_adaptive_lock_info; @@ -391,6 +583,8 @@ union KMP_ALIGN_CACHE kmp_queuing_lock { typedef union kmp_queuing_lock kmp_queuing_lock_t; +#endif // !KMP_USE_ABT + extern int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); @@ -410,6 +604,12 @@ extern void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck); // ---------------------------------------------------------------------------- // Adaptive locks. 
+#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_adaptive_lock_t; + +#else // KMP_USE_ABT + struct kmp_base_adaptive_lock { kmp_base_queuing_lock qlk; KMP_ALIGN(CACHE_LINE) @@ -429,10 +629,18 @@ typedef union kmp_adaptive_lock kmp_adaptive_lock_t; #define GET_QLK_PTR(l) ((kmp_queuing_lock_t *)&(l)->lk.qlk) +#endif // !KMP_USE_ABT + #endif // KMP_USE_ADAPTIVE_LOCKS // ---------------------------------------------------------------------------- // DRDPA ticket locks. +#if KMP_USE_ABT + +typedef kmp_abt_mutex_lock_t kmp_drdpa_lock_t; + +#else // KMP_USE_ABT + struct kmp_base_drdpa_lock { // All of the fields on the first cache line are only written when // initializing or reconfiguring the lock. These are relatively rare @@ -462,7 +670,7 @@ struct kmp_base_drdpa_lock { // written by the acquiring thread) than it does in the simple ticket locks // (where it is written by the releasing thread). // - // Since now_serving is only read an written in the critical section, + // Since now_serving is only read and written in the critical section, // it is non-volatile, but it needs to exist on a separate cache line, // as it is invalidated at every lock acquire. // @@ -488,6 +696,8 @@ union KMP_ALIGN_CACHE kmp_drdpa_lock { typedef union kmp_drdpa_lock kmp_drdpa_lock_t; +#endif // !KMP_USE_ABT + extern int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); @@ -514,6 +724,48 @@ extern void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck); // implemented with other lock kinds as they require gtids which are not // available at initialization time. +#if KMP_USE_ABT + +typedef kmp_abt_spin_lock_t kmp_bootstrap_lock_t; + +extern int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck); +extern int __kmp_test_bootstrap_lock(kmp_bootstrap_lock_t *lck); +extern void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck); +extern void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck); +extern void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck); + +static inline kmp_bootstrap_lock_t __kmp_bootstrap_lock_init_helper + (kmp_bootstrap_lock_t *lck) { + __kmp_init_bootstrap_lock(lck); + return *lck; +} +#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) \ + __kmp_bootstrap_lock_init_helper(&lock) +#define KMP_BOOTSTRAP_LOCK_INIT(lock) \ + kmp_bootstrap_lock_t lock = KMP_ABT_SPIN_LOCK_INITIALIZER(lock) + +typedef kmp_abt_rspin_lock_t kmp_rbootstrap_lock_t; + +// Recursive bootstrap lock. An owner must be an execution stream to +// acquire the lock recursively. 
+extern int __kmp_acquire_rbootstrap_lock(kmp_rbootstrap_lock_t *lck); +extern int __kmp_acquire_rbootstrap_lock_if_es(kmp_rbootstrap_lock_t *lck); +extern int __kmp_test_rbootstrap_lock(kmp_rbootstrap_lock_t *lck); +extern void __kmp_release_rbootstrap_lock(kmp_rbootstrap_lock_t *lck); +extern void __kmp_release_rbootstrap_lock_if_es(kmp_rbootstrap_lock_t *lck); +extern void __kmp_init_rbootstrap_lock(kmp_rbootstrap_lock_t *lck); +extern void __kmp_destroy_rbootstrap_lock(kmp_rbootstrap_lock_t *lck); + +static inline kmp_rbootstrap_lock_t __kmp_rbootstrap_lock_init_helper + (kmp_rbootstrap_lock_t *lck) { + __kmp_init_rbootstrap_lock(lck); + return *lck; +} +#define KMP_RBOOTSTRAP_LOCK_INITIALIZER(lock) \ + __kmp_rbootstrap_lock_init_helper(&lock) + +#else // KMP_USE_ABT + typedef kmp_ticket_lock_t kmp_bootstrap_lock_t; #define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) KMP_TICKET_LOCK_INITIALIZER((lock)) @@ -540,6 +792,8 @@ static inline void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck) { __kmp_destroy_ticket_lock(lck); } +#endif // !KMP_USE_ABT + // Internal RTL locks. // // Internal RTL locks are also implemented as ticket locks, for now. @@ -587,7 +841,8 @@ enum kmp_lock_kind { #endif #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX lk_hle, - lk_rtm, + lk_rtm_queuing, + lk_rtm_spin, #endif lk_ticket, lk_queuing, @@ -861,11 +1116,11 @@ __kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) { // // In other cases, the calling code really should differentiate between an // unimplemented function and one that is implemented but returning NULL / -// invalied value. If this is the case, no get function wrapper exists. +// invalid value. If this is the case, no get function wrapper exists. extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck); -// no set function; fields set durining local allocation +// no set function; fields set during local allocation extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck); @@ -899,7 +1154,7 @@ static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck, } } -// The fuction which sets up all of the vtbl pointers for kmp_user_lock_t. +// The function which sets up all of the vtbl pointers for kmp_user_lock_t. extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind); // Macros for binding user lock functions. @@ -1033,27 +1288,28 @@ extern void __kmp_cleanup_user_locks(); // Shortcuts #define KMP_USE_INLINED_TAS \ - (KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)) && 1 -#define KMP_USE_INLINED_FUTEX KMP_USE_FUTEX && 0 + (KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)) \ + && !KMP_USE_ABT && 1 +#define KMP_USE_INLINED_FUTEX KMP_USE_FUTEX && !KMP_USE_ABT && 0 // List of lock definitions; all nested locks are indirect locks. // hle lock is xchg lock prefixed with XACQUIRE/XRELEASE. // All nested locks are indirect lock types. 
#if KMP_USE_TSX #if KMP_USE_FUTEX -#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a) +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a) m(rtm_spin, a) #define KMP_FOREACH_I_LOCK(m, a) \ - m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a) \ + m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a) \ m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a) \ m(nested_queuing, a) m(nested_drdpa, a) #else -#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a) +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a) m(rtm_spin, a) #define KMP_FOREACH_I_LOCK(m, a) \ - m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a) \ + m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a) \ m(nested_tas, a) m(nested_ticket, a) m(nested_queuing, a) \ m(nested_drdpa, a) #endif // KMP_USE_FUTEX -#define KMP_LAST_D_LOCK lockseq_hle +#define KMP_LAST_D_LOCK lockseq_rtm_spin #else #if KMP_USE_FUTEX #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) @@ -1128,7 +1384,7 @@ extern int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32); extern int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32); // Function tables for indirect locks. Set/unset/test differentiate functions -// with/withuot consistency checking. +// with/without consistency checking. extern void (*__kmp_indirect_init[])(kmp_user_lock_p); extern void (**__kmp_indirect_destroy)(kmp_user_lock_p); extern int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32); diff --git a/runtime/src/kmp_omp.h b/runtime/src/kmp_omp.h index 27b550d1f..c7ba32a14 100644 --- a/runtime/src/kmp_omp.h +++ b/runtime/src/kmp_omp.h @@ -47,7 +47,7 @@ typedef struct { } kmp_omp_nthr_item_t; typedef struct { - kmp_int32 num; // Number of items in the arrray. + kmp_int32 num; // Number of items in the array. kmp_uint64 array; // Address of array of kmp_omp_num_threads_item_t. } kmp_omp_nthr_info_t; diff --git a/runtime/src/kmp_os.h b/runtime/src/kmp_os.h index cd942a9c4..91c8c584f 100644 --- a/runtime/src/kmp_os.h +++ b/runtime/src/kmp_os.h @@ -14,8 +14,9 @@ #define KMP_OS_H #include "kmp_config.h" -#include #include +#include +#include #define KMP_FTN_PLAIN 1 #define KMP_FTN_APPEND 2 @@ -69,7 +70,7 @@ #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_USE_ABT #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 @@ -200,6 +201,18 @@ typedef kmp_uint32 kmp_uint; #define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF) #define KMP_INT_MIN ((kmp_int32)0x80000000) +// stdarg handling +#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && \ + (KMP_OS_FREEBSD || KMP_OS_LINUX) +typedef va_list *kmp_va_list; +#define kmp_va_deref(ap) (*(ap)) +#define kmp_va_addr_of(ap) (&(ap)) +#else +typedef va_list kmp_va_list; +#define kmp_va_deref(ap) (ap) +#define kmp_va_addr_of(ap) (ap) +#endif + #ifdef __cplusplus // macros to cast out qualifiers and to re-interpret types #define CCAST(type, var) const_cast(var) @@ -268,6 +281,16 @@ template <> struct traits_t { #define __forceinline __inline #endif +/* Check if the OS/arch can support user-level mwait */ +// All mwait code tests for UMWAIT first, so it should only fall back to ring3 +// MWAIT for KNL. 
+#define KMP_HAVE_MWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC2) +#define KMP_HAVE_UMWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC) + #if KMP_OS_WINDOWS #include @@ -313,12 +336,24 @@ extern "C" { # define KMP_FALLTHROUGH() [[fallthrough]] #elif __has_cpp_attribute(clang::fallthrough) # define KMP_FALLTHROUGH() [[clang::fallthrough]] -#elif __has_attribute(fallthough) || __GNUC__ >= 7 +#elif __has_attribute(fallthrough) || __GNUC__ >= 7 # define KMP_FALLTHROUGH() __attribute__((__fallthrough__)) #else # define KMP_FALLTHROUGH() ((void)0) #endif +#if KMP_HAVE_ATTRIBUTE_WAITPKG +#define KMP_ATTRIBUTE_TARGET_WAITPKG __attribute__((target("waitpkg"))) +#else +#define KMP_ATTRIBUTE_TARGET_WAITPKG /* Nothing */ +#endif + +#if KMP_HAVE_ATTRIBUTE_RTM +#define KMP_ATTRIBUTE_TARGET_RTM __attribute__((target("rtm"))) +#else +#define KMP_ATTRIBUTE_TARGET_RTM /* Nothing */ +#endif + // Define attribute that indicates a function does not return #if __cplusplus >= 201103L #define KMP_NORETURN [[noreturn]] @@ -338,10 +373,16 @@ extern "C" { #define KMP_ALIAS(alias_of) __attribute__((alias(alias_of))) #endif +#if KMP_HAVE_WEAK_ATTRIBUTE && !KMP_DYNAMIC_LIB +#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak)) +#else +#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */ +#endif + #if KMP_HAVE_WEAK_ATTRIBUTE -#define KMP_WEAK_ATTRIBUTE __attribute__((weak)) +#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak)) #else -#define KMP_WEAK_ATTRIBUTE /* Nothing */ +#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */ #endif // Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME diff --git a/runtime/src/kmp_platform.h b/runtime/src/kmp_platform.h index 3238deafc..4296ca31d 100644 --- a/runtime/src/kmp_platform.h +++ b/runtime/src/kmp_platform.h @@ -22,7 +22,6 @@ #define KMP_OS_OPENBSD 0 #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 -#define KMP_OS_CNK 0 #define KMP_OS_HURD 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ @@ -66,11 +65,6 @@ #define KMP_OS_OPENBSD 1 #endif -#if (defined __bgq__) -#undef KMP_OS_CNK -#define KMP_OS_CNK 1 -#endif - #if (defined __GNU__) #undef KMP_OS_HURD #define KMP_OS_HURD 1 @@ -93,9 +87,9 @@ #define KMP_ARCH_X86 0 #define KMP_ARCH_X86_64 0 #define KMP_ARCH_AARCH64 0 -#define KMP_ARCH_PPC64_BE 0 -#define KMP_ARCH_PPC64_LE 0 -#define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_LE || KMP_ARCH_PPC64_BE) +#define KMP_ARCH_PPC64_ELFv1 0 +#define KMP_ARCH_PPC64_ELFv2 0 +#define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1) #define KMP_ARCH_MIPS 0 #define KMP_ARCH_MIPS64 0 #define KMP_ARCH_RISCV64 0 @@ -118,12 +112,12 @@ #undef KMP_ARCH_X86 #define KMP_ARCH_X86 1 #elif defined __powerpc64__ -#if defined __LITTLE_ENDIAN__ -#undef KMP_ARCH_PPC64_LE -#define KMP_ARCH_PPC64_LE 1 +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#undef KMP_ARCH_PPC64_ELFv2 +#define KMP_ARCH_PPC64_ELFv2 1 #else -#undef KMP_ARCH_PPC64_BE -#define KMP_ARCH_PPC64_BE 1 +#undef KMP_ARCH_PPC64_ELFv1 +#define KMP_ARCH_PPC64_ELFv1 1 #endif #elif defined __aarch64__ #undef KMP_ARCH_AARCH64 @@ -143,7 +137,7 @@ #endif #if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7A__) + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7VE__) #define KMP_ARCH_ARMV7 1 #endif diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp index dd6e0ff70..e380b0c7f 100644 --- a/runtime/src/kmp_runtime.cpp +++ b/runtime/src/kmp_runtime.cpp @@ -32,6 +32,11 @@ #include "ompt-specific.h" #endif +#if OMPTARGET_PROFILING_SUPPORT +#include "llvm/Support/TimeProfiler.h" +static char *ProfileTraceFile = nullptr; +#endif + /* these are temporary issues to be dealt with */ #define KMP_USE_PRCTL 0 @@ -41,6 +46,15 @@ #include "tsan_annotations.h" +#if KMP_OS_WINDOWS +// windows does not need include files as it doesn't use shared memory +#else +#include +#include +#include +#define SHM_SIZE 1024 +#endif + #if defined(KMP_GOMP_COMPAT) char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes"; @@ -76,8 +90,10 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only = 0); #endif static void __kmp_do_serial_initialize(void); +#if !KMP_USE_ABT void __kmp_fork_barrier(int gtid, int tid); void __kmp_join_barrier(int gtid); +#endif void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc); @@ -89,20 +105,24 @@ static int __kmp_expand_threads(int nNeed); #if KMP_OS_WINDOWS static int __kmp_unregister_root_other_thread(int gtid); #endif -static void __kmp_unregister_library(void); // called by __kmp_internal_end() static void __kmp_reap_thread(kmp_info_t *thread, int is_root); +#if KMP_REMOVE_FORKJOIN_LOCK +/* __kmp_thread_pool_insert_pt must be protected by __kmp_thread_pool_lock. */ +#endif kmp_info_t *__kmp_thread_pool_insert_pt = NULL; /* Calculate the identifier of the current thread */ /* fast (and somewhat portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ int __kmp_get_global_thread_id() { +#if !KMP_USE_ABT int i; kmp_info_t **other_threads; size_t stack_data; char *stack_addr; size_t stack_size; char *stack_base; +#endif KA_TRACE( 1000, @@ -117,6 +137,14 @@ int __kmp_get_global_thread_id() { if (!TCR_4(__kmp_init_gtid)) return KMP_GTID_DNE; +#if KMP_USE_ABT + + /* Argobots version always uses TLS to get a global thread id. 
*/ + KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); + return __kmp_gtid_get_specific(); + +#else // KMP_USE_ABT + #ifdef KMP_TDATA_GTID if (TCR_4(__kmp_gtid_mode) >= 3) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); @@ -204,14 +232,28 @@ int __kmp_get_global_thread_id() { "th_%d stack (refinement)", i); } return i; + +#endif // !KMP_USE_ABT } int __kmp_get_global_thread_id_reg() { int gtid; +#if KMP_USE_ABT + if (!__kmp_abt_init_global) + __kmp_abt_global_initialize(); +#endif + if (!__kmp_init_serial) { gtid = KMP_GTID_DNE; } else +#if KMP_USE_ABT + { + /* Argobots always uses TLS. */ + KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); + gtid = __kmp_gtid_get_specific(); + } +#else // KMP_USE_ABT #ifdef KMP_TDATA_GTID if (TCR_4(__kmp_gtid_mode) >= 3) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); @@ -226,6 +268,7 @@ int __kmp_get_global_thread_id_reg() { ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); gtid = __kmp_get_global_thread_id(); } +#endif // !KMP_USE_ABT /* we must be a new uber master sibling thread */ if (gtid == KMP_GTID_DNE) { @@ -432,6 +475,7 @@ void __kmp_abort_process() { raise(SIGABRT); _exit(3); // Just in case, if signal ignored, exit anyway. } else { + __kmp_unregister_library(); abort(); } @@ -459,6 +503,7 @@ static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t), "th_%d.th_local", gtid); +#if !KMP_USE_ABT __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); @@ -479,6 +524,7 @@ static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid); #endif // KMP_FAST_REDUCTION_BARRIER +#endif // !KMP_USE_ABT } /* Print out the storage map for the major kmp_team_t team data structures @@ -490,6 +536,7 @@ static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", header, team_id); +#if !KMP_USE_ABT __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier], sizeof(kmp_balign_team_t) * bs_last_barrier, @@ -511,6 +558,7 @@ static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id); #endif // KMP_FAST_REDUCTION_BARRIER +#endif // !KMP_USE_ABT __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], @@ -775,7 +823,9 @@ void __kmp_exit_single(int gtid) { static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, int master_tid, int set_nthreads, int enter_teams) { +#if !KMP_REMOVE_FORKJOIN_LOCK int capacity; +#endif int new_nthreads; KMP_DEBUG_ASSERT(__kmp_init_serial); KMP_DEBUG_ASSERT(root && parent_team); @@ -899,6 +949,7 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, new_nthreads = tl_nthreads; } +#if !KMP_REMOVE_FORKJOIN_LOCK // Check if the threads array is large enough, or needs expanding. // See comment in __kmp_register_root() about the adjustment if // __kmp_threads[0] == NULL. 
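[Editor's illustrative sketch, not part of the patch: the hunks above add a kmp_va_list abstraction in kmp_os.h (typedef va_list *, kmp_va_deref, kmp_va_addr_of on Linux/FreeBSD ARM/x86_64/AArch64; plain va_list elsewhere), and the __kmp_fork_call hunks below consume it via va_arg(kmp_va_deref(ap), void *). The standalone C program here restates that pattern under simplified platform guards; fork_like_entry and collect_args are hypothetical names, not runtime functions.]

/* Sketch of the kmp_va_list wrapper pattern: pass va_list by pointer on ABIs
 * where va_list is an array type, by value elsewhere, behind one macro pair. */
#include <stdarg.h>
#include <stdio.h>

#if defined(__x86_64__) && defined(__linux__) /* simplified guard */
typedef va_list *kmp_va_list;
#define kmp_va_deref(ap) (*(ap))
#define kmp_va_addr_of(ap) (&(ap))
#else
typedef va_list kmp_va_list;
#define kmp_va_deref(ap) (ap)
#define kmp_va_addr_of(ap) (ap)
#endif

/* Consumer is written once against kmp_va_list and works in both builds. */
static void collect_args(int argc, void **argv, kmp_va_list ap) {
  for (int i = 0; i < argc; ++i)
    argv[i] = va_arg(kmp_va_deref(ap), void *);
}

static void fork_like_entry(int argc, ...) {
  void *argv[8];
  va_list ap;
  va_start(ap, argc);
  collect_args(argc, argv, kmp_va_addr_of(ap));
  va_end(ap);
  for (int i = 0; i < argc; ++i)
    printf("arg %d = %p\n", i, argv[i]);
}

int main(void) {
  int x = 1, y = 2;
  fork_like_entry(2, (void *)&x, (void *)&y);
  return 0;
}

[This removes the per-call-site #if workaround the patch deletes from __kmp_fork_call: callers always pass kmp_va_addr_of(ap), and only kmp_os.h knows whether that is a pointer or a copy.]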
@@ -935,6 +986,10 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, } } } +#else // KMP_REMOVE_FORKJOIN_LOCK + // There is no fork/join lock, so we cannot calculate the exact number of + // threads needed for further parallel regions here. Let's ignore it. +#endif // KMP_REMOVE_FORKJOIN_LOCK #ifdef KMP_DEBUG if (new_nthreads == 1) { @@ -1021,15 +1076,23 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, KMP_DEBUG_ASSERT(thr); KMP_DEBUG_ASSERT(thr->th.th_team == team); /* align team and thread arrived states */ +#if KMP_USE_ABT + KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " + "T#%d(%d:%d)\n", + __kmp_gtid_from_tid(0, team), team->t.t_id, 0, + __kmp_gtid_from_tid(i, team), team->t.t_id, i)); +#else // KMP_USE_ABT KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " "T#%d(%d:%d) join =%llu, plain=%llu\n", __kmp_gtid_from_tid(0, team), team->t.t_id, 0, __kmp_gtid_from_tid(i, team), team->t.t_id, i, team->t.t_bar[bs_forkjoin_barrier].b_arrived, team->t.t_bar[bs_plain_barrier].b_arrived)); +#endif // !KMP_USE_ABT thr->th.th_teams_microtask = master_th->th.th_teams_microtask; thr->th.th_teams_level = master_th->th.th_teams_level; thr->th.th_teams_size = master_th->th.th_teams_size; +#if !KMP_USE_ABT { // Initialize threads' barrier data. int b; kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; @@ -1041,6 +1104,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, #endif } } +#endif } #if KMP_AFFINITY_SUPPORTED @@ -1062,7 +1126,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, KMP_MB(); } -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_ABT // Propagate any changes to the floating point control registers out to the team // We try to avoid unnecessary writes to the relevant cache line in the team // structure, so we don't make changes unless they are needed. @@ -1122,7 +1186,7 @@ inline static void updateHWFPControl(kmp_team_t *team) { #else #define propagateFPControl(x) ((void)0) #define updateHWFPControl(x) ((void)0) -#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ +#endif /* (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_ABT */ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc); // forward declaration @@ -1389,13 +1453,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { int __kmp_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, // Intel, GNU, ... 
kmp_int32 argc, microtask_t microtask, launch_t invoker, -/* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - va_list *ap -#else - va_list ap -#endif - ) { + kmp_va_list ap) { void **argv; int i; int master_tid; @@ -1505,18 +1563,20 @@ int __kmp_fork_call(ident_t *loc, int gtid, parent_team->t.t_argc = argc; argv = (void **)parent_team->t.t_argv; for (i = argc - 1; i >= 0; --i) -/* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - *argv++ = va_arg(*ap, void *); -#else - *argv++ = va_arg(ap, void *); -#endif + *argv++ = va_arg(kmp_va_deref(ap), void *); // Increment our nested depth levels, but not increase the serialization if (parent_team == master_th->th.th_serial_team) { // AC: we are in serialized parallel __kmpc_serialized_parallel(loc, gtid); KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); + if (call_context == fork_context_gnu) { + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; + return TRUE; + } + #if OMPT_SUPPORT void *dummy; void **exit_frame_p; @@ -1588,7 +1648,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, parent_team->t.t_pkfn = microtask; parent_team->t.t_invoke = invoker; - KMP_ATOMIC_INC(&root->r.r_in_parallel); parent_team->t.t_active_level++; parent_team->t.t_level++; parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save @@ -1625,6 +1684,22 @@ int __kmp_fork_call(ident_t *loc, int gtid, } #endif +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || + KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode == 3 && + parent_team->t.t_active_level == 1 // only report frames at level 1 + && master_th->th.th_teams_size.nteams == 1) { + kmp_uint64 tmp_time = __itt_get_timestamp(); + master_th->th.th_frame_time = tmp_time; + parent_team->t.t_region_time = tmp_time; + } + if (__itt_stack_caller_create_ptr) { + // create new stack stitching id before entering fork barrier + parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); @@ -1633,6 +1708,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); + if (call_context == fork_context_gnu) + return TRUE; + /* Invoke microtask for MASTER thread */ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); @@ -1674,7 +1752,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, // __kmp_reserve_threads() to speedup nested serialized parallels. if (nthreads > 1) { if ((get__max_active_levels(master_th) == 1 && - (root->r.r_in_parallel && !enter_teams)) || + (root->r.r_active && !enter_teams)) || (__kmp_library == library_serial)) { KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" " threads\n", @@ -1684,7 +1762,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, } if (nthreads > 1) { /* determine how many new threads we can use */ +#if !KMP_REMOVE_FORKJOIN_LOCK __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); +#endif /* AC: If we execute teams from parallel region (on host), then teams should be created but each can only have 1 thread if nesting is disabled. 
If teams called from serial region, then teams and their @@ -1695,7 +1775,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, // Free lock for single thread execution here; for multi-thread // execution it will be freed later after team of threads created // and initialized +#if !KMP_REMOVE_FORKJOIN_LOCK __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); +#endif } } } @@ -1804,12 +1886,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, argv = (void **)team->t.t_argv; if (ap) { for (i = argc - 1; i >= 0; --i) -// TODO: revert workaround for Intel(R) 64 tracker #96 -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - *argv++ = va_arg(*ap, void *); -#else - *argv++ = va_arg(ap, void *); -#endif + *argv++ = va_arg(kmp_va_deref(ap), void *); } else { for (i = 0; i < argc; ++i) // Get args from parent team for teams construct @@ -1840,12 +1917,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, } else { argv = args; for (i = argc - 1; i >= 0; --i) -// TODO: revert workaround for Intel(R) 64 tracker #96 -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - *argv++ = va_arg(*ap, void *); -#else - *argv++ = va_arg(ap, void *); -#endif + *argv++ = va_arg(kmp_va_deref(ap), void *); KMP_MB(); #if OMPT_SUPPORT @@ -1950,11 +2022,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); master_th->th.th_current_task->td_flags.executing = 0; - if (!master_th->th.th_teams_microtask || level > teams_level) { - /* Increment our nested depth level */ - KMP_ATOMIC_INC(&root->r.r_in_parallel); - } - // See if we need to make a copy of the ICVs. int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; if ((level + 1 < __kmp_nested_nth.used) && @@ -2130,12 +2197,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, argv = (void **)team->t.t_argv; if (ap) { for (i = argc - 1; i >= 0; --i) { -// TODO: revert workaround for Intel(R) 64 tracker #96 -#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX - void *new_argv = va_arg(*ap, void *); -#else - void *new_argv = va_arg(ap, void *); -#endif + void *new_argv = va_arg(kmp_va_deref(ap), void *); KMP_CHECK_UPDATE(*argv, new_argv); argv++; } @@ -2159,7 +2221,9 @@ int __kmp_fork_call(ident_t *loc, int gtid, master_th->th.ompt_thread_info.state = ompt_state_work_parallel; #endif +#if !KMP_REMOVE_FORKJOIN_LOCK __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); +#endif #if USE_ITT_BUILD if (team->t.t_active_level == 1 // only report frames at level 1 @@ -2303,7 +2367,11 @@ void __kmp_join_call(ident_t *loc, int gtid #if OMPT_SUPPORT void *team_microtask = (void *)team->t.t_pkfn; - if (ompt_enabled.enabled) { + // For GOMP interface with serialized parallel, need the + // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task + // and end-parallel events. + if (ompt_enabled.enabled && + !(team->t.t_serialized && fork_context == fork_context_gnu)) { master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif @@ -2367,14 +2435,13 @@ void __kmp_join_call(ident_t *loc, int gtid #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { - __kmp_itt_stack_caller_destroy( - (__itt_caller)team->t - .t_stack_id); // destroy the stack stitching id after join barrier + // destroy the stack stitching id after join barrier + __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); } - // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
if (team->t.t_active_level == 1 && - !master_th->th.th_teams_microtask) { /* not in teams construct */ + (!master_th->th.th_teams_microtask || /* not in teams construct */ + master_th->th.th_teams_size.nteams == 1)) { master_th->th.th_ident = loc; // only one notification scheme (either "submit" or "forking/joined", not // both) @@ -2414,7 +2481,6 @@ void __kmp_join_call(ident_t *loc, int gtid /* Decrement our nested depth level */ team->t.t_level--; team->t.t_active_level--; - KMP_ATOMIC_DEC(&root->r.r_in_parallel); // Restore number of threads in the team if needed. This code relies on // the proper adjustment of th_teams_size.nth after the fork in @@ -2431,6 +2497,7 @@ void __kmp_join_call(ident_t *loc, int gtid // Adjust states of non-used threads of the team for (int i = old_num; i < new_num; ++i) { // Re-initialize thread's barrier data. +#if !KMP_USE_ABT KMP_DEBUG_ASSERT(other_threads[i]); kmp_balign_t *balign = other_threads[i]->th.th_bar; for (int b = 0; b < bs_last_barrier; ++b) { @@ -2444,6 +2511,7 @@ void __kmp_join_call(ident_t *loc, int gtid // Synchronize thread's task state other_threads[i]->th.th_task_state = master_th->th.th_task_state; } +#endif } } @@ -2463,17 +2531,12 @@ void __kmp_join_call(ident_t *loc, int gtid master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; +#if !KMP_REMOVE_FORKJOIN_LOCK /* jc: The following lock has instructions with REL and ACQ semantics, separating the parallel user code called in this parallel region from the serial user code called after this function returns. */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); - - if (!master_th->th.th_teams_microtask || - team->t.t_level > master_th->th.th_teams_level) { - /* Decrement our nested depth level */ - KMP_ATOMIC_DEC(&root->r.r_in_parallel); - } - KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); +#endif #if OMPT_SUPPORT if (ompt_enabled.enabled) { @@ -2557,7 +2620,9 @@ void __kmp_join_call(ident_t *loc, int gtid // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); master_th->th.th_current_task->td_flags.executing = 1; +#if !KMP_REMOVE_FORKJOIN_LOCK __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); +#endif #if OMPT_SUPPORT int flags = @@ -2629,6 +2694,7 @@ void __kmp_set_num_threads(int new_nth, int gtid) { set__nproc(thread, new_nth); +#if !KMP_REMOVE_FORKJOIN_LOCK // If this omp_set_num_threads() call will cause the hot team size to be // reduced (in the absence of a num_threads clause), then reduce it now, // rather than waiting for the next parallel region. @@ -2673,6 +2739,9 @@ void __kmp_set_num_threads(int new_nth, int gtid) { // Special flag in case omp_set_num_threads() call hot_team->t.t_size_changed = -1; } +#else + // Since hot_team is not protected by __kmp_forkjoin_lock, we can't modify it. +#endif } /* Changes max_active_levels */ @@ -3037,6 +3106,32 @@ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { } } +#if KMP_USE_ABT +/* This function is used to initialize task queues for implicit tasks. */ +static inline void __kmp_abt_init_task_queues(kmp_taskdata_t *tds, int num) { + int i; + for (i = 0; i < num; i++) { + tds[i].td_task_queue = NULL; + tds[i].td_tq_cur_size = 0; + tds[i].td_tq_max_size = 0; + } +} + +/* This function is used to deallocate task queues for implicit tasks. 
*/ +static inline void __kmp_abt_fini_task_queues(kmp_taskdata_t *tds, int num) { + int i; + for (i = 0; i < num; i++) { + kmp_taskdata_t *td = &tds[i]; + if (td->td_task_queue) { + KMP_DEBUG_ASSERT(td->td_tq_cur_size == 0); + KMP_INTERNAL_FREE(td->td_task_queue); + td->td_task_queue = NULL; + td->td_tq_max_size = 0; + } + } +} +#endif // KMP_USE_ABT + static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { int i; int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2; @@ -3050,6 +3145,10 @@ static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); team->t.t_max_nproc = max_nth; +#if KMP_USE_ABT + __kmp_abt_init_task_queues(team->t.t_implicit_task_taskdata, max_nth); +#endif + /* setup dispatch buffers */ for (i = 0; i < num_disp_buff; ++i) { team->t.t_disp_buffer[i].buffer_index = i; @@ -3066,6 +3165,10 @@ static void __kmp_free_team_arrays(kmp_team_t *team) { team->t.t_dispatch[i].th_disp_buffer = NULL; } } +#if KMP_USE_ABT + __kmp_abt_fini_task_queues(team->t.t_implicit_task_taskdata, + team->t.t_max_nproc); +#endif #if KMP_USE_HIER_SCHED __kmp_dispatch_free_hierarchies(team); #endif @@ -3082,6 +3185,10 @@ static void __kmp_free_team_arrays(kmp_team_t *team) { static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { kmp_info_t **oldThreads = team->t.t_threads; +#if KMP_USE_ABT + __kmp_abt_fini_task_queues(team->t.t_implicit_task_taskdata, + team->t.t_max_nproc); +#endif __kmp_free(team->t.t_disp_buffer); __kmp_free(team->t.t_dispatch); __kmp_free(team->t.t_implicit_task_taskdata); @@ -3153,7 +3260,6 @@ static void __kmp_initialize_root(kmp_root_t *root) { __kmp_init_lock(&root->r.r_begin_lock); root->r.r_begin = FALSE; root->r.r_active = FALSE; - root->r.r_in_parallel = 0; root->r.r_blocktime = __kmp_dflt_blocktime; /* setup the root team for this task */ @@ -3171,6 +3277,9 @@ static void __kmp_initialize_root(kmp_root_t *root) { 0 // argc USE_NESTED_HOT_ARG(NULL) // master thread is unknown ); +#if KMP_USE_ABT + root_team->t.t_proc_bind_applied = proc_bind_default; +#endif #if USE_DEBUGGER // Non-NULL value should be assigned to make the debugger display the root // team. @@ -3371,8 +3480,6 @@ void __kmp_print_structure(void) { __kmp_print_structure_thread(" Uber Thread: ", root->r.r_uber_thread); __kmp_printf(" Active?: %2d\n", root->r.r_active); - __kmp_printf(" In Parallel: %2d\n", - KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); __kmp_printf("\n"); __kmp_print_structure_team_accum(list, root->r.r_root_team); __kmp_print_structure_team_accum(list, root->r.r_hot_team); @@ -3441,7 +3548,7 @@ static const unsigned __kmp_primes[] = { // __kmp_get_random: Get a random number using a linear congruential method. unsigned short __kmp_get_random(kmp_info_t *thread) { unsigned x = thread->th.th_x; - unsigned short r = x >> 16; + unsigned short r = (unsigned short)(x >> 16); thread->th.th_x = x * thread->th.th_a + 1; @@ -3529,7 +3636,7 @@ static int __kmp_expand_threads(int nNeed) { // > __kmp_max_nth in one of two ways: // // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] - // may not be resused by another thread, so we may need to increase + // may not be reused by another thread, so we may need to increase // __kmp_threads_capacity to __kmp_max_nth + 1. // // 2) New foreign root(s) are encountered. 
We always register new foreign @@ -3615,6 +3722,7 @@ int __kmp_register_root(int initial_thread) { --capacity; } +#if !KMP_REMOVE_FORKJOIN_LOCK /* see if there are too many threads */ if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { if (__kmp_tp_cached) { @@ -3626,20 +3734,42 @@ int __kmp_register_root(int initial_thread) { __kmp_msg_null); } } +#else + // At least, the length should be more than 1. + __kmp_acquire_bootstrap_lock(&__kmp_threads_lock); + __kmp_expand_threads(1); + __kmp_release_bootstrap_lock(&__kmp_threads_lock); +#endif +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_threads_lock); +#endif /* find an available thread slot */ /* Don't reassign the zero slot since we need that to only be used by initial thread */ for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) +#if !KMP_REMOVE_FORKJOIN_LOCK ; +#else + { + // If the capacity of __kmp_threads is not enough, expands it here. + if (gtid - 1 >= __kmp_threads_capacity) + __kmp_expand_threads(__kmp_threads_capacity * 2); + } +#endif KA_TRACE(1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); KMP_ASSERT(gtid < __kmp_threads_capacity); /* update global accounting */ +#if !KMP_REMOVE_FORKJOIN_LOCK __kmp_all_nth++; TCW_4(__kmp_nth, __kmp_nth + 1); +#else + KMP_TEST_THEN_INC32(&__kmp_all_nth); + KMP_TEST_THEN_INC32(&__kmp_nth); +#endif // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low // numbers of procs, and method #2 (keyed API call) for higher numbers. @@ -3725,6 +3855,10 @@ int __kmp_register_root(int initial_thread) { /* drop root_thread into place */ TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_threads_lock); +#endif + root->r.r_root_team->t.t_threads[0] = root_thread; root->r.r_hot_team->t.t_threads[0] = root_thread; root_thread->th.th_serial_team->t.t_threads[0] = root_thread; @@ -3736,6 +3870,17 @@ int __kmp_register_root(int initial_thread) { __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); TCW_4(__kmp_init_gtid, TRUE); +#if KMP_USE_ABT + + /* Mark root_thread as active */ + TCW_4(root_thread->th.th_active, TRUE); + /* prepare the master thread for get_gtid() */ + root_thread->th.th_info.ds.ds_gtid = gtid; + __kmp_abt_create_uber(gtid, root_thread, __kmp_stksize); + KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); + +#else // KMP_USE_ABT + /* prepare the master thread for get_gtid() */ __kmp_gtid_set_specific(gtid); @@ -3766,6 +3911,8 @@ int __kmp_register_root(int initial_thread) { KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == KMP_INIT_BARRIER_STATE); +#endif // !KMP_USE_ABT + #if KMP_AFFINITY_SUPPORTED root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; @@ -3910,8 +4057,12 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) { } #endif +#if !KMP_REMOVE_FORKJOIN_LOCK TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. +#else + KMP_TEST_THEN_DEC32(&__kmp_nth); +#endif i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" " to %d\n", @@ -3925,10 +4076,18 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) { __kmp_free(root->r.r_uber_thread->th.th_cg_roots); root->r.r_uber_thread->th.th_cg_roots = NULL; } +#if KMP_REMOVE_FORKJOIN_LOCK + /* __kmp_reap_thread requires __kmp_thread_pool_lock. 
*/ + __kmp_acquire_bootstrap_lock(&__kmp_thread_pool_lock); +#endif __kmp_reap_thread(root->r.r_uber_thread, 1); +#if KMP_REMOVE_FORKJOIN_LOCK + /* __kmp_reap_thread requires __kmp_thread_pool_lock. */ + __kmp_release_bootstrap_lock(&__kmp_thread_pool_lock); +#endif - // We canot put root thread to __kmp_thread_pool, so we have to reap it istead - // of freeing. + // We canot put root thread to __kmp_thread_pool, so we have to reap it + // instead of freeing. root->r.r_uber_thread = NULL; /* mark root as no longer in use */ root->r.r_begin = FALSE; @@ -3974,9 +4133,8 @@ void __kmp_unregister_root_current_thread(int gtid) { __kmp_reset_root(gtid, root); /* free up this thread slot */ - __kmp_gtid_set_specific(KMP_GTID_DNE); -#ifdef KMP_TDATA_GTID - __kmp_gtid = KMP_GTID_DNE; +#if KMP_USE_ABT + __kmp_abt_set_self_info(NULL); #endif KMP_MB(); @@ -4198,6 +4356,10 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, #endif KMP_MB(); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_thread_pool_lock); +#endif + /* first, try to get one from the thread pool */ if (__kmp_thread_pool) { new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); @@ -4220,12 +4382,20 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, KMP_ASSERT(!new_thr->th.th_team); KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_thread_pool_lock); +#endif + /* setup the thread structure */ __kmp_initialize_info(new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid); KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); +#if !KMP_REMOVE_FORKJOIN_LOCK TCW_4(__kmp_nth, __kmp_nth + 1); +#else + KMP_TEST_THEN_INC32(&__kmp_nth); +#endif new_thr->th.th_task_state = 0; new_thr->th.th_task_state_top = 0; @@ -4241,7 +4411,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } #endif /* KMP_ADJUST_BLOCKTIME */ -#if KMP_DEBUG +#if !KMP_USE_ABT && KMP_DEBUG // If thread entered pool via __kmp_free_thread, wait_flag should != // KMP_BARRIER_PARENT_FLAG. int b; @@ -4257,8 +4427,17 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, return new_thr; } +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_thread_pool_lock); +#endif + /* no, well fork a new one */ +#if KMP_REMOVE_FORKJOIN_LOCK + // __kmp_forkjoin_lock maintains the consistency of __kmp_nth and + // __kmp_all_nth; the following condition does not hold without this lock. +#else KMP_ASSERT(__kmp_nth == __kmp_all_nth); +#endif KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); #if KMP_USE_MONITOR @@ -4291,9 +4470,18 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } #endif +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_threads_lock); +#endif + KMP_MB(); for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); +#if KMP_REMOVE_FORKJOIN_LOCK + // If the length of __kmp_threads is not enough, expands here. + if (new_gtid - 1 >= __kmp_threads_capacity) + __kmp_expand_threads(__kmp_threads_capacity * 2); +#endif } /* allocate space for it. 
*/ @@ -4301,6 +4489,43 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_threads_lock); +#endif + +#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG + // suppress race conditions detection on synchronization flags in debug mode + // this helps to analyze library internals eliminating false positives + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); +#if KMP_OS_WINDOWS + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); +#else + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + &new_thr->th.th_suspend_init_count, + sizeof(new_thr->th.th_suspend_init_count)); +#endif + // TODO: check if we need to also suppress b_arrived flags + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), + sizeof(new_thr->th.th_bar[0].bb.b_go)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), + sizeof(new_thr->th.th_bar[1].bb.b_go)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), + sizeof(new_thr->th.th_bar[2].bb.b_go)); +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ if (__kmp_storage_map) { __kmp_print_thread_storage_map(new_thr, new_gtid); } @@ -4344,6 +4569,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); +#if !KMP_USE_ABT int b; kmp_balign_t *balign = new_thr->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { @@ -4352,6 +4578,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; balign[b].bb.use_oncore_barrier = 0; } +#endif new_thr->th.th_spin_here = FALSE; new_thr->th.th_next_waiting = 0; @@ -4373,9 +4600,25 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, new_thr->th.th_active_in_pool = FALSE; TCW_4(new_thr->th.th_active, TRUE); +#if !KMP_REMOVE_FORKJOIN_LOCK /* adjust the global counters */ __kmp_all_nth++; __kmp_nth++; +#else + KMP_TEST_THEN_INC32(&__kmp_all_nth); + KMP_TEST_THEN_INC32(&__kmp_nth); +#endif + +#if KMP_USE_ABT + + // We don't fork the new work thread (will do it later) but set gtid. + new_thr->th.th_info.ds.ds_thread = ABT_THREAD_NULL; + new_thr->th.th_info.ds.ds_gtid = new_gtid; + + new_thr->th.th_current_place_id = -1; + new_thr->th.th_creation_group_end_tid = -1; + +#else // KMP_USE_ABT // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low // numbers of procs, and method #2 (keyed API call) for higher numbers. 
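[Editor's illustrative sketch, not part of the patch: the comment just above distinguishes gtid lookup "method #1" (stack-address search) from "method #2" (keyed TLS), and the KMP_USE_ABT paths earlier in this file always take the keyed-TLS route via __kmp_gtid_get_specific(). The self-contained program below shows the keyed-TLS idea with pthread keys standing in for the runtime's machinery; gtid_set_specific/gtid_get_specific/worker are hypothetical names, and the gtid+1 encoding is only an assumption about how an unset slot is distinguished from gtid 0.]

/* Keyed-TLS gtid lookup sketch: each thread registers its id in a TLS slot
 * and later retrieves it without inspecting stack addresses. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_key_t gtid_key;
static pthread_once_t gtid_key_once = PTHREAD_ONCE_INIT;

static void make_gtid_key(void) { pthread_key_create(&gtid_key, NULL); }

/* Store gtid+1 so an unset (NULL) slot is distinguishable from gtid 0. */
static void gtid_set_specific(int gtid) {
  pthread_once(&gtid_key_once, make_gtid_key);
  pthread_setspecific(gtid_key, (void *)(intptr_t)(gtid + 1));
}

static int gtid_get_specific(void) {
  pthread_once(&gtid_key_once, make_gtid_key);
  void *v = pthread_getspecific(gtid_key);
  return v ? (int)(intptr_t)v - 1 : -1; /* -1 plays the role of "not assigned" */
}

static void *worker(void *arg) {
  gtid_set_specific((int)(intptr_t)arg);
  printf("worker sees gtid %d\n", gtid_get_specific());
  return NULL;
}

int main(void) {
  pthread_t t1, t2;
  pthread_create(&t1, NULL, worker, (void *)(intptr_t)1);
  pthread_create(&t2, NULL, worker, (void *)(intptr_t)2);
  pthread_join(t1, NULL);
  pthread_join(t2, NULL);
  return 0; /* build with -pthread */
}

[Keyed lookup costs a TLS access per query but needs no per-thread stack bookkeeping, which is why the Argobots build, whose user-level threads migrate across execution streams, relies on it unconditionally.]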
@@ -4408,6 +4651,8 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, KF_TRACE(10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); +#endif // !KMP_USE_ABT + KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid)); KMP_MB(); @@ -4467,7 +4712,7 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, // TODO???: team->t.t_max_active_levels = new_max_active_levels; team->t.t_sched.sched = new_icvs->sched.sched; -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_ABT team->t.t_fp_control_saved = FALSE; /* not needed */ team->t.t_x87_fpu_control_word = 0; /* not needed */ team->t.t_mxcsr = 0; /* not needed */ @@ -4515,11 +4760,11 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { #if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. -// It calculats the worker + master thread's partition based upon the parent +// It calculates the worker + master thread's partition based upon the parent // thread's partition, and binds each worker to a thread in their partition. // The master thread's partition should already include its current binding. static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { - // Copy the master thread's place partion to the team struct + // Copy the master thread's place partition to the team struct kmp_info_t *master_th = team->t.t_threads[0]; KMP_DEBUG_ASSERT(master_th != NULL); kmp_proc_bind_t proc_bind = team->t.t_proc_bind; @@ -4902,12 +5147,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } hot_teams = master->th.th_hot_teams; if (level < __kmp_hot_teams_max_level && hot_teams && - hot_teams[level] - .hot_team) { // hot team has already been allocated for given level + hot_teams[level].hot_team) { + // hot team has already been allocated for given level use_hot_team = 1; } else { use_hot_team = 0; } + } else { + // check we won't access uninitialized hot_teams, just in case + KMP_DEBUG_ASSERT(new_nproc == 1); } #endif // Optimization to use a "hot" team @@ -4996,6 +5244,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if KMP_NESTED_HOT_TEAMS } // (__kmp_hot_teams_mode == 0) else { +#if !KMP_USE_ABT // When keeping extra threads in team, switch threads to wait on own // b_go flag for (f = new_nproc; f < team->t.t_nproc; ++f) { @@ -5008,6 +5257,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); } } +#endif // !KMP_USE_ABT } #endif // KMP_NESTED_HOT_TEAMS team->t.t_nproc = new_nproc; @@ -5058,6 +5308,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, int avail_threads = hot_teams[level].hot_team_nth; if (new_nproc < avail_threads) avail_threads = new_nproc; +#if !KMP_USE_ABT kmp_info_t **other_threads = team->t.t_threads; for (f = team->t.t_nproc; f < avail_threads; ++f) { // Adjust barrier data of reserved threads (if any) of the team @@ -5072,6 +5323,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #endif } } +#endif // !KMP_USE_ABT if (hot_teams[level].hot_team_nth >= new_nproc) { // we have all needed threads in reserve, no need to allocate any // this only possible in mode 1, cannot have reserved threads in mode 0 @@ -5104,6 +5356,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KMP_DEBUG_ASSERT(new_worker); team->t.t_threads[f] = new_worker; +#if !KMP_USE_ABT 
KA_TRACE(20, ("__kmp_allocate_team: team %d init T#%d arrived: " "join=%llu, plain=%llu\n", @@ -5123,6 +5376,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #endif } } +#endif // !KMP_USE_ABT } #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED @@ -5157,7 +5411,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level]; } else { // set th_task_state for new threads in non-nested hot team - int old_state = + kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state for (f = old_nproc; f < team->t.t_nproc; ++f) team->t.t_threads[f]->th.th_task_state = old_state; @@ -5187,7 +5441,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, thr->th.th_teams_size = master->th.th_teams_size; } } -#if KMP_NESTED_HOT_TEAMS +#if KMP_NESTED_HOT_TEAMS && !KMP_USE_ABT if (level) { // Sync barrier state for nested hot teams, not needed for outermost hot // team. @@ -5204,7 +5458,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } } -#endif // KMP_NESTED_HOT_TEAMS +#endif // KMP_NESTED_HOT_TEAMS && !KMP_USE_ABT +#if KMP_USE_ABT + /* reinit the barrier */ + ABT_barrier_reinit(team->t.t_team_bar, new_nproc); +#endif /* reallocate space for arguments if necessary */ __kmp_alloc_argv_entries(argc, team, TRUE); @@ -5232,6 +5490,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } /* next, let's try to take one from the team pool */ +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_team_pool_lock); +#endif KMP_MB(); for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { /* TODO: consider resizing undersized teams instead of reaping them, now @@ -5240,6 +5501,10 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* take this team from the team pool */ __kmp_team_pool = team->t.t_next_pool; +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_team_pool_lock); +#endif + /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); @@ -5256,6 +5521,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); +#if !KMP_USE_ABT { // Initialize barrier data. int b; for (b = 0; b < bs_last_barrier; ++b) { @@ -5266,6 +5532,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #endif } } +#endif team->t.t_proc_bind = new_proc_bind; @@ -5288,6 +5555,9 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, team = __kmp_reap_team(team); __kmp_team_pool = team; } +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_team_pool_lock); +#endif /* nothing available in the pool, no matter, make a new team! */ KMP_MB(); @@ -5321,6 +5591,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, KA_TRACE(20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); +#if !KMP_USE_ABT + { // Initialize barrier data. 
int b; for (b = 0; b < bs_last_barrier; ++b) { @@ -5332,6 +5604,13 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, } } +#else // KMP_USE_ABT + + /* initialize the team barrier */ + ABT_barrier_create(new_nproc, &team->t.t_team_bar); + +#endif // KMP_USE_ABT + team->t.t_proc_bind = new_proc_bind; #if OMPT_SUPPORT @@ -5357,7 +5636,6 @@ void __kmp_free_team(kmp_root_t *root, int f; KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id)); - /* verify state */ KMP_DEBUG_ASSERT(root); KMP_DEBUG_ASSERT(team); @@ -5401,6 +5679,16 @@ void __kmp_free_team(kmp_root_t *root, if (!use_hot_team) { if (__kmp_tasking_mode != tskm_immediate_exec) { // Wait for threads to reach reapable state +#if KMP_USE_ABT + // Wait for all tasks. + for (f = 1; f < team->t.t_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + kmp_info_t *th = team->t.t_threads[f]; + __kmp_abt_wait_child_tasks(th, true, 0); + // Now it is safe to reap this thread. + th->th.th_reap_state = KMP_SAFE_TO_REAP; + } +#else // KMP_USE_ABT for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); kmp_info_t *th = team->t.t_threads[f]; @@ -5415,12 +5703,13 @@ void __kmp_free_team(kmp_root_t *root, } #endif // first check if thread is sleeping - kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); + kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); if (fl.is_sleeping()) fl.resume(__kmp_gtid_from_thread(th)); KMP_CPU_PAUSE(); } } +#endif // !KMP_USE_ABT // Delete task teams int tt_idx; @@ -5455,10 +5744,16 @@ void __kmp_free_team(kmp_root_t *root, team->t.t_threads[f] = NULL; } +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_team_pool_lock); +#endif /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = (volatile kmp_team_t *)team; +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_team_pool_lock); +#endif } else { // Check if team was created for the masters in a teams construct // See if first worker is a CG root KMP_DEBUG_ASSERT(team->t.t_threads[1] && @@ -5503,6 +5798,9 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { /* TODO clean the threads that are a part of this? */ /* free stuff */ +#if KMP_USE_ABT + ABT_barrier_free(&team->t.t_team_bar); +#endif __kmp_free_team_arrays(team); if (team->t.t_argv != &team->t.t_inline_argv[0]) __kmp_free((void *)team->t.t_argv); @@ -5536,7 +5834,7 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { // locality problems on programs where the size of the hot team regularly // grew and shrunk. // -// Now, for single-level parallelism, the OMP tid is alway == gtid. +// Now, for single-level parallelism, the OMP tid is always == gtid. void __kmp_free_thread(kmp_info_t *this_th) { int gtid; kmp_info_t **scan; @@ -5546,6 +5844,7 @@ void __kmp_free_thread(kmp_info_t *this_th) { KMP_DEBUG_ASSERT(this_th); +#if !KMP_USE_ABT // When moving thread to pool, switch thread to wait on own b_go flag, and // uninitialized (NULL team). 
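/* Illustrative sketch (assumes Argobots and <abt.h>; toy_team is a
 * hypothetical stand-in for kmp_team_t): under KMP_USE_ABT the hand-rolled
 * per-thread barrier flags are replaced by a single ABT_barrier per team,
 * created when a fresh team is allocated, re-initialized for the new thread
 * count when a hot team is resized, and freed when the team is reaped -- the
 * three calls seen above.  Reduced to that lifecycle (ABT_init must already
 * have run; each team member would block in ABT_barrier_wait(t->team_bar)): */
#include <abt.h>
#include <cassert>
#include <cstdint>

struct toy_team {
  int nproc;
  ABT_barrier team_bar;  // analogue of team->t.t_team_bar
};

static void toy_team_alloc(toy_team *t, int nproc) {
  t->nproc = nproc;
  int ret = ABT_barrier_create((uint32_t)nproc, &t->team_bar);
  assert(ret == ABT_SUCCESS);
}

static void toy_team_resize(toy_team *t, int new_nproc) {
  // Reinit is cheaper than free + create and matches the hot-team reuse path.
  int ret = ABT_barrier_reinit(t->team_bar, (uint32_t)new_nproc);
  assert(ret == ABT_SUCCESS);
  t->nproc = new_nproc;
}

static void toy_team_reap(toy_team *t) {
  int ret = ABT_barrier_free(&t->team_bar);
  assert(ret == ABT_SUCCESS);
}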
int b; @@ -5556,6 +5855,7 @@ void __kmp_free_thread(kmp_info_t *this_th) { balign[b].bb.team = NULL; balign[b].bb.leaf_kids = 0; } +#endif // !KMP_USE_ABT this_th->th.th_task_state = 0; this_th->th.th_reap_state = KMP_SAFE_TO_REAP; @@ -5595,6 +5895,10 @@ void __kmp_free_thread(kmp_info_t *this_th) { __kmp_free_implicit_task(this_th); this_th->th.th_current_task = NULL; +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_thread_pool_lock); +#endif + // If the __kmp_thread_pool_insert_pt is already past the new insert // point, then we need to re-scan the entire list. gtid = this_th->th.th_info.ds.ds_gtid; @@ -5609,7 +5913,7 @@ void __kmp_free_thread(kmp_info_t *this_th) { // scan is the address of a link in the list, possibly the address of // __kmp_thread_pool itself. // - // In the absence of nested parallism, the for loop will have 0 iterations. + // In the absence of nested parallelism, the for loop will have 0 iterations. if (__kmp_thread_pool_insert_pt != NULL) { scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); } else { @@ -5640,7 +5944,15 @@ void __kmp_free_thread(kmp_info_t *this_th) { #endif __kmp_unlock_suspend_mx(this_th); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_thread_pool_lock); +#endif + +#if !KMP_REMOVE_FORKJOIN_LOCK TCW_4(__kmp_nth, __kmp_nth - 1); +#else + KMP_TEST_THEN_DEC32(&__kmp_nth); +#endif #ifdef KMP_ADJUST_BLOCKTIME /* Adjust blocktime back to user setting or default if necessary */ @@ -5658,7 +5970,15 @@ void __kmp_free_thread(kmp_info_t *this_th) { /* ------------------------------------------------------------------------ */ +#if !KMP_USE_ABT void *__kmp_launch_thread(kmp_info_t *this_thr) { +#if OMPTARGET_PROFILING_SUPPORT + ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); + // TODO: add a configuration option for time granularity + if (ProfileTraceFile) + llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); +#endif + int gtid = this_thr->th.th_info.ds.ds_gtid; /* void *stack_data;*/ kmp_team_t **volatile pteam; @@ -5759,56 +6079,34 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) { KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); KMP_MB(); + +#if OMPTARGET_PROFILING_SUPPORT + llvm::timeTraceProfilerFinishThread(); +#endif return this_thr; } +#endif // !KMP_USE_ABT /* ------------------------------------------------------------------------ */ void __kmp_internal_end_dest(void *specific_gtid) { -#if KMP_COMPILER_ICC -#pragma warning(push) -#pragma warning(disable : 810) // conversion from "void *" to "int" may lose -// significant bits -#endif // Make sure no significant bits are lost - int gtid = (kmp_intptr_t)specific_gtid - 1; -#if KMP_COMPILER_ICC -#pragma warning(pop) -#endif + int gtid; + __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage * this is because 0 is reserved for the nothing-stored case */ - /* josh: One reason for setting the gtid specific data even when it is being - destroyed by pthread is to allow gtid lookup through thread specific data - (__kmp_gtid_get_specific). Some of the code, especially stat code, - that gets executed in the call to __kmp_internal_end_thread, actually - gets the gtid through the thread specific data. Setting it here seems - rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread - to run smoothly. 
- todo: get rid of this after we remove the dependence on - __kmp_gtid_get_specific */ - if (gtid >= 0 && KMP_UBER_GTID(gtid)) - __kmp_gtid_set_specific(gtid); -#ifdef KMP_TDATA_GTID - __kmp_gtid = gtid; -#endif __kmp_internal_end_thread(gtid); } #if KMP_OS_UNIX && KMP_DYNAMIC_LIB -// 2009-09-08 (lev): It looks the destructor does not work. In simple test cases -// destructors work perfectly, but in real libomp.so I have no evidence it is -// ever called. However, -fini linker option in makefile.mk works fine. - __attribute__((destructor)) void __kmp_internal_end_dtor(void) { __kmp_internal_end_atexit(); } -void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); } - #endif /* [Windows] josh: when the atexit handler is called, there may still be more @@ -5845,7 +6143,11 @@ void __kmp_internal_end_atexit(void) { } static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { +#if !KMP_REMOVE_FORKJOIN_LOCK // It is assumed __kmp_forkjoin_lock is acquired. +#else + // It is assumed __kmp_thread_pool_lock is acquired. +#endif int gtid; @@ -5862,8 +6164,11 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { /* Need release fence here to prevent seg faults for tree forkjoin barrier * (GEH) */ ANNOTATE_HAPPENS_BEFORE(thread); - kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); +#if !KMP_USE_ABT + kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); __kmp_release_64(&flag); +#endif } // Terminate OS thread. @@ -5898,7 +6203,11 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); TCW_SYNC_PTR(__kmp_threads[gtid], NULL); +#if !KMP_REMOVE_FORKJOIN_LOCK --__kmp_all_nth; +#else + KMP_TEST_THEN_DEC32(&__kmp_all_nth); +#endif // __kmp_nth was decremented when thread is added to the pool. #ifdef KMP_ADJUST_BLOCKTIME @@ -6016,6 +6325,9 @@ static void __kmp_internal_end(void) { KMP_MB(); // Reap the worker threads. +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_thread_pool_lock); +#endif // This is valid for now, but be careful if threads are reaped sooner. while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. // Get the next thread from the pool. @@ -6028,8 +6340,14 @@ static void __kmp_internal_end(void) { __kmp_reap_thread(thread, 0); } __kmp_thread_pool_insert_pt = NULL; +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_thread_pool_lock); +#endif // Reap teams. +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_acquire_bootstrap_lock(&__kmp_team_pool_lock); +#endif while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. // Get the next team from the pool. kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); @@ -6038,6 +6356,9 @@ static void __kmp_internal_end(void) { team->t.t_next_pool = NULL; __kmp_reap_team(team); } +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_release_bootstrap_lock(&__kmp_team_pool_lock); +#endif __kmp_reap_task_teams(); @@ -6094,7 +6415,7 @@ void __kmp_internal_end_library(int gtid_req) { only place to clear __kmp_serial_init */ /* we'll check this later too, after we get the lock */ // 2009-09-06: We do not set g_abort without setting g_done. This check looks - // redundaant, because the next check will work in any case. + // redundant, because the next check will work in any case. if (__kmp_global.g.g_abort) { KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); /* TODO abort? 
*/ @@ -6106,7 +6427,6 @@ void __kmp_internal_end_library(int gtid_req) { } KMP_MB(); /* Flush all pending memory write invalidates. */ - /* find out who we are and what we should do */ { int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); @@ -6129,6 +6449,7 @@ void __kmp_internal_end_library(int gtid_req) { if (__kmp_root[gtid]->r.r_active) { __kmp_global.g.g_abort = -1; TCW_SYNC_4(__kmp_global.g.g_done, TRUE); + __kmp_unregister_library(); KA_TRACE(10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid)); @@ -6148,6 +6469,10 @@ void __kmp_internal_end_library(int gtid_req) { if (__kmp_debug_buf) __kmp_dump_debug_buffer(); #endif + // added unregister library call here when we switch to shm linux + // if we don't, it will leave lots of files in /dev/shm + // cleanup shared memory file before exiting. + __kmp_unregister_library(); return; } } @@ -6192,6 +6517,11 @@ void __kmp_internal_end_library(int gtid_req) { __kmp_fini_allocator(); +#if KMP_USE_ABT + /* last cleanup part */ + __kmp_abt_global_destroy(); +#endif + } // __kmp_internal_end_library void __kmp_internal_end_thread(int gtid_req) { @@ -6329,11 +6659,17 @@ static char *__kmp_registration_str = NULL; // Value to be saved in env var __KMP_REGISTERED_LIB_. static inline char *__kmp_reg_status_name() { - /* On RHEL 3u5 if linked statically, getpid() returns different values in - each thread. If registration and unregistration go in different threads - (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env - env var can not be found, because the name will contain different pid. */ +/* On RHEL 3u5 if linked statically, getpid() returns different values in + each thread. If registration and unregistration go in different threads + (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env + env var can not be found, because the name will contain different pid. */ +// macOS* complains about name being too long with additional getuid() +#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB + return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), + (int)getuid()); +#else return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); +#endif } // __kmp_reg_status_get void __kmp_register_library_startup(void) { @@ -6360,16 +6696,61 @@ void __kmp_register_library_startup(void) { char *value = NULL; // Actual value of the environment variable. +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + char *shm_name = __kmp_str_format("/%s", name); + int shm_preexist = 0; + char *data1; + int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = shm_open(shm_name, O_RDWR, 0666); + if (fd1 == -1) { // file didn't open + // error out here + __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), + __kmp_msg_null); + } else { + // able to open existing file + shm_preexist = 1; + } + } else if (fd1 == -1) { // SHM didn't open; it was due to error other than + // already exists. + // error out here. 
+ __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), + __kmp_msg_null); + } + if (shm_preexist == 0) { + // we created SHM now set size + if (ftruncate(fd1, SHM_SIZE) == -1) { + // error occured setting size; + __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), + KMP_ERR(errno), __kmp_msg_null); + } + } + data1 = + (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); + if (data1 == MAP_FAILED) { + // failed to map shared memory + __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), + __kmp_msg_null); + } + if (shm_preexist == 0) { // set data to SHM, set value + KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + } + // Read value from either what we just wrote or existing file. + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + close(fd1); +#else // Windows and unix with static library // Set environment variable, but do not overwrite if it is exist. __kmp_env_set(name, __kmp_registration_str, 0); - // Check the variable is written. + // read value to see if it got set value = __kmp_env_get(name); - if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { +#endif + if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { done = 1; // Ok, environment variable set successfully, exit the loop. - } else { - // Oops. Write failed. Another copy of OpenMP RTL is in memory. // Check whether it alive or dead. int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. @@ -6419,14 +6800,23 @@ void __kmp_register_library_startup(void) { done = 1; // Exit the loop. } break; case 2: { // Neighbor is dead. + +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + // close shared memory. + shm_unlink(shm_name); // this removes file in /dev/shm +#else // Clear the variable and try to register library again. __kmp_env_unset(name); +#endif } break; default: { KMP_DEBUG_ASSERT(0); } break; } } KMP_INTERNAL_FREE((void *)value); - } +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + KMP_INTERNAL_FREE((void *)shm_name); +#endif + } // while KMP_INTERNAL_FREE((void *)name); } // func __kmp_register_library_startup @@ -6434,15 +6824,40 @@ void __kmp_register_library_startup(void) { void __kmp_unregister_library(void) { char *name = __kmp_reg_status_name(); - char *value = __kmp_env_get(name); + char *value = NULL; + +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + char *shm_name = __kmp_str_format("/%s", name); + int fd1 = shm_open(shm_name, O_RDONLY, 0666); + if (fd1 == -1) { + // file did not open. return. + return; + } + char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); + if (data1 != MAP_FAILED) { + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + close(fd1); +#else + value = __kmp_env_get(name); +#endif KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { - // Ok, this is our variable. Delete it. +// Ok, this is our variable. Delete it. 
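/* Illustrative sketch (standalone; kShmSize and register_or_attach are
 * hypothetical names, not the patch's): the registration code above follows a
 * create-or-attach pattern on POSIX shared memory.  shm_open with
 * O_CREAT|O_EXCL decides atomically whether this process is the first
 * registrant -- which then sizes the segment and writes its registration
 * string -- or a later one, which simply reads back what the first writer
 * stored.  Unregistering removes the /dev/shm file with shm_unlink.  May need
 * -lrt on older glibc. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>

static const size_t kShmSize = 1024;  // stand-in for SHM_SIZE

// shm_name must start with '/'.  Returns 0 on success and fills 'value' with
// whatever string the segment ends up holding (ours or the first writer's).
static int register_or_attach(const char *shm_name, const char *my_value,
                              char *value, size_t value_len) {
  bool created = true;
  int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
  if (fd == -1 && errno == EEXIST) {  // somebody registered before us
    created = false;
    fd = shm_open(shm_name, O_RDWR, 0666);
  }
  if (fd == -1)
    return -1;
  if (created && ftruncate(fd, (off_t)kShmSize) == -1) {  // size only once
    close(fd);
    return -1;
  }
  char *data = (char *)mmap(nullptr, kShmSize, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) {
    close(fd);
    return -1;
  }
  if (created)  // the first writer publishes its registration string
    snprintf(data, kShmSize, "%s", my_value);
  snprintf(value, value_len, "%s", data);  // everyone reads the stored value
  munmap(data, kShmSize);
  close(fd);
  return 0;
}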
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + shm_unlink(shm_name); // this removes file in /dev/shm +#else __kmp_env_unset(name); +#endif } +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library + KMP_INTERNAL_FREE(shm_name); +#endif + KMP_INTERNAL_FREE(__kmp_registration_str); KMP_INTERNAL_FREE(value); KMP_INTERNAL_FREE(name); @@ -6473,9 +6888,51 @@ static void __kmp_check_mic_type() { #endif /* KMP_MIC_SUPPORTED */ +#if KMP_HAVE_UMWAIT +static void __kmp_user_level_mwait_init() { + struct kmp_cpuid buf; + __kmp_x86_cpuid(7, 0, &buf); + __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", + __kmp_umwait_enabled)); +} +#elif KMP_HAVE_MWAIT +#ifndef AT_INTELPHIUSERMWAIT +// Spurious, non-existent value that should always fail to return anything. +// Will be replaced with the correct value when we know that. +#define AT_INTELPHIUSERMWAIT 10000 +#endif +// getauxval() function is available in RHEL7 and SLES12. If a system with an +// earlier OS is used to build the RTL, we'll use the following internal +// function when the entry is not found. +unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; +unsigned long getauxval(unsigned long) { return 0; } + +static void __kmp_user_level_mwait_init() { + // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available + // use them to find if the user-level mwait is enabled. Otherwise, forcibly + // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable + // KMP_USER_LEVEL_MWAIT was set to TRUE. + if (__kmp_mic_type == mic3) { + unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); + if ((res & 0x1) || __kmp_user_level_mwait) { + __kmp_mwait_enabled = TRUE; + if (__kmp_user_level_mwait) { + KMP_INFORM(EnvMwaitWarn); + } + } else { + __kmp_mwait_enabled = FALSE; + } + } + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " + "__kmp_mwait_enabled = %d\n", + __kmp_mic_type, __kmp_mwait_enabled)); +} +#endif /* KMP_HAVE_UMWAIT */ + static void __kmp_do_serial_initialize(void) { int i, gtid; - int size; + size_t size; KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); @@ -6489,6 +6946,11 @@ static void __kmp_do_serial_initialize(void) { ompt_pre_init(); #endif +#if KMP_USE_ABT + if (!__kmp_abt_init_global) + __kmp_abt_global_initialize(); +#endif + __kmp_validate_locks(); /* Initialize internal memory allocator */ @@ -6533,6 +6995,11 @@ static void __kmp_do_serial_initialize(void) { __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); +#if KMP_REMOVE_FORKJOIN_LOCK + __kmp_init_bootstrap_lock(&__kmp_threads_lock); + __kmp_init_bootstrap_lock(&__kmp_team_pool_lock); + __kmp_init_bootstrap_lock(&__kmp_thread_pool_lock); +#endif __kmp_init_bootstrap_lock(&__kmp_exit_lock); #if KMP_USE_MONITOR __kmp_init_bootstrap_lock(&__kmp_monitor_lock); @@ -6647,6 +7114,9 @@ static void __kmp_do_serial_initialize(void) { __kmp_env_initialize(NULL); +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + __kmp_user_level_mwait_init(); +#endif // Print all messages in message catalog for testing purposes. 
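/* Illustrative sketch (x86 only; cpu_supports_waitpkg is a hypothetical
 * helper): __kmp_user_level_mwait_init above probes CPUID leaf 7, sub-leaf 0,
 * and tests ECX bit 5 -- the WAITPKG feature bit covering
 * umonitor/umwait/tpause -- then enables user-level mwait only when the user
 * also asked for it.  The same probe with the GCC/Clang <cpuid.h> helper
 * (availability of __get_cpuid_count depends on compiler version): */
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>

static bool cpu_supports_waitpkg() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false;                // CPUID leaf 7 not available on this CPU
  return ((ecx >> 5) & 1) != 0;  // ECX bit 5 == WAITPKG
}

static bool umwait_usable(bool user_requested) {
  // Same policy as the patch: hardware support AND an explicit opt-in
  // (KMP_USER_LEVEL_MWAIT) are both required.
  return cpu_supports_waitpkg() && user_requested;
}
#endif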
#ifdef KMP_DEBUG char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); @@ -6919,7 +7389,7 @@ void __kmp_parallel_initialize(void) { KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); KMP_ASSERT(KMP_UBER_GTID(gtid)); -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_ABT // Save the FP control regs. // Worker threads will set theirs to these values at thread startup. __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); @@ -6969,9 +7439,9 @@ void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, /* none of the threads have encountered any constructs, yet. */ this_thr->th.th_local.this_construct = 0; -#if KMP_CACHE_MANAGE +#if KMP_CACHE_MANAGE && !KMP_USE_ABT KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); -#endif /* KMP_CACHE_MANAGE */ +#endif /* KMP_CACHE_MANAGE && !KMP_USE_ABT */ dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); KMP_DEBUG_ASSERT(dispatch); KMP_DEBUG_ASSERT(team->t.t_dispatch); @@ -7158,7 +7628,7 @@ int __kmp_invoke_teams_master(int gtid) { /* this sets the requested number of threads for the next parallel region encountered by this team. since this should be enclosed in the forkjoin - critical section it should avoid race conditions with assymmetrical nested + critical section it should avoid race conditions with asymmetrical nested parallelism */ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { @@ -7279,8 +7749,22 @@ void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { } #endif /* KMP_DEBUG */ +#if KMP_USE_ABT + // Set up th_task_team. + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Originally, it is done in fork_barrier(). + // Only master thread can execute it. + __kmp_task_team_setup(this_thr, team, 0); + this_thr->th.th_task_team = team->t.t_task_team[this_thr->th.th_task_state]; + } + + /* Create worker threads here */ + __kmp_abt_create_workers(team); + KA_TRACE(20, ("__kmp_internal_fork: after __kmp_abt_create_workers")); +#else // KMP_USE_ABT /* release the worker threads so they may begin working */ __kmp_fork_barrier(gtid, 0); +#endif // !KMP_USE_ABT } void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { @@ -7308,7 +7792,27 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); #endif /* KMP_DEBUG */ +#if KMP_USE_ABT + { + /* The master thread executes the remaining tasks*/ + __kmp_abt_wait_child_tasks(this_thr, true, FALSE); + + kmp_taskdata_t *taskdata = this_thr->th.th_current_task; + + __kmp_abt_release_info(this_thr); + + /* Join Argobots ULTs here */ + __kmp_abt_join_workers(team); + KA_TRACE(20, ("__kmp_internal_join: after __kmp_abt_join_workers")); + // We don't need atomic operations to get thread info if it joined an + // outermost parallel region. + __kmp_abt_acquire_info_for_task(this_thr, taskdata, team, + team->t.t_level != 1); + } +#else // KMP_USE_ABT __kmp_join_barrier(gtid); /* wait for everyone */ +#endif // !KMP_USE_ABT + #if OMPT_SUPPORT if (ompt_enabled.enabled && this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { @@ -7376,6 +7880,7 @@ static int __kmp_active_hot_team_nproc(kmp_root_t *root) { // Perform an automatic adjustment to the number of // threads used by the next parallel region. 
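/* Illustrative sketch (assumes Argobots; worker() and the pool handling are
 * simplified stand-ins for __kmp_abt_create_workers/__kmp_abt_join_workers):
 * under KMP_USE_ABT the fork barrier is replaced by creating one user-level
 * thread (ULT) per team member at __kmp_internal_fork, and the join barrier
 * by joining those ULTs at __kmp_internal_join, as in the hunks above.  The
 * standalone shape of that fork/join: */
#include <abt.h>
#include <cstdint>
#include <cstdio>

static void worker(void *arg) {
  int rank = (int)(intptr_t)arg;
  printf("ULT %d runs the parallel-region body\n", rank);
}

int main(int argc, char **argv) {
  ABT_init(argc, argv);

  ABT_xstream self;
  ABT_pool pool;
  ABT_xstream_self(&self);
  ABT_xstream_get_main_pools(self, 1, &pool);  // pool the ULTs are pushed to

  const int nproc = 4;  // team size requested at the fork point
  ABT_thread ults[nproc];

  // "Fork": ULT creation is cheap, so one is made per team member.
  for (int i = 0; i < nproc; i++)
    ABT_thread_create(pool, worker, (void *)(intptr_t)i, ABT_THREAD_ATTR_NULL,
                      &ults[i]);

  // "Join": wait for each worker, then release its descriptor.
  for (int i = 0; i < nproc; i++) {
    ABT_thread_join(ults[i]);
    ABT_thread_free(&ults[i]);
  }

  ABT_finalize();
  return 0;
}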
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { +#if !KMP_USE_ABT int retval; int pool_active; int hot_team_active; @@ -7447,6 +7952,12 @@ static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); return retval; +#else + // In BOLT, most schedulers (=kernel threads) are busy-wait, so load balancing + // above does not work. Since threads are very lightweight, BOLT always + // creates as many as requested. + return set_nproc; +#endif // !KMP_USE_ABT } // __kmp_load_balance_nproc() #endif /* USE_LOAD_BALANCE */ @@ -7601,7 +8112,7 @@ void __kmp_user_set_library(enum library_type arg) { KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial)); - if (root->r.r_in_parallel) { /* Must be called in serial section of top-level + if (root->r.r_active) { /* Must be called in serial section of top-level thread */ KMP_WARNING(SetLibraryIncorrectCall); return; @@ -7869,7 +8380,7 @@ static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, const char *long_name = __kmp_affinity_format_table[i].long_name; char field_format = __kmp_affinity_format_table[i].field_format; if (parse_long_name) { - int length = KMP_STRLEN(long_name); + size_t length = KMP_STRLEN(long_name); if (strncmp(*ptr, long_name, length) == 0) { found_valid_name = true; (*ptr) += length; // skip the long name @@ -8019,7 +8530,7 @@ void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { #if KMP_USE_MONITOR int bt_intervals; #endif - int bt_set; + kmp_int8 bt_set; __kmp_save_internal_controls(thread); @@ -8058,7 +8569,7 @@ void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { #endif } -void __kmp_aux_set_defaults(char const *str, int len) { +void __kmp_aux_set_defaults(char const *str, size_t len) { if (!__kmp_init_serial) { __kmp_serial_initialize(); } @@ -8123,7 +8634,11 @@ __kmp_determine_reduction_method( teamsize_cutoff = 8; } #endif +#if KMP_USE_ABT + int tree_available = 0; +#else int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; +#endif if (tree_available) { if (team_size <= teamsize_cutoff) { if (atomic_available) { @@ -8179,6 +8694,9 @@ __kmp_determine_reduction_method( // method and stay with the unsynchronized method (empty_reduce_block) if (__kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) { +#if KMP_USE_ABT + KMP_ASSERT(0); // "unsupported method specified" +#else // !KMP_USE_ABT PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; @@ -8214,6 +8732,7 @@ __kmp_determine_reduction_method( } retval = forced_retval; +#endif // !KMP_USE_ABT } KA_TRACE(10, ("reduction method selected=%08x\n", retval)); @@ -8223,7 +8742,6 @@ __kmp_determine_reduction_method( return (retval); } - // this function is for testing set/get/determine reduce method kmp_int32 __kmp_get_reduce_method(void) { return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); @@ -8245,10 +8763,12 @@ void __kmp_resume_if_soft_paused() { if (__kmp_pause_status == kmp_soft_paused) { __kmp_pause_status = kmp_not_paused; +#if !KMP_USE_ABT for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { kmp_info_t *thread = __kmp_threads[gtid]; if (thread) { // Wake it if sleeping - kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); + kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); if (fl.is_sleeping()) fl.resume(gtid); else if (__kmp_try_suspend_mx(thread)) { // got suspend 
lock @@ -8266,6 +8786,7 @@ void __kmp_resume_if_soft_paused() { } } } +#endif // !KMP_USE_ABT } } @@ -8303,3 +8824,12 @@ int __kmp_pause_resource(kmp_pause_status_t level) { return 1; } } + + +void __kmp_omp_display_env(int verbose) { + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_serial == 0) + __kmp_do_serial_initialize(); + __kmp_display_env_impl(!verbose, verbose); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} diff --git a/runtime/src/kmp_safe_c_api.h b/runtime/src/kmp_safe_c_api.h index f839f734a..abc0a16f8 100644 --- a/runtime/src/kmp_safe_c_api.h +++ b/runtime/src/kmp_safe_c_api.h @@ -64,11 +64,9 @@ static inline void __kmp_strncpy_truncate(char *buffer, size_t buf_size, char const *src, size_t src_size) { if (src_size >= buf_size) { src_size = buf_size - 1; - KMP_STRNCPY_S(buffer, buf_size, src, src_size); - buffer[buf_size - 1] = '\0'; - } else { - KMP_STRNCPY_S(buffer, buf_size, src, src_size); } + KMP_STRNCPY_S(buffer, buf_size, src, src_size); + buffer[src_size] = '\0'; } #endif // KMP_SAFE_C_API_H diff --git a/runtime/src/kmp_sched.cpp b/runtime/src/kmp_sched.cpp index 17c149806..2d8f644c8 100644 --- a/runtime/src/kmp_sched.cpp +++ b/runtime/src/kmp_sched.cpp @@ -61,6 +61,12 @@ char const *traits_t::spec = "ld"; #define KMP_STATS_LOOP_END(stat) /* Nothing */ #endif +static ident_t loc_stub = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"}; +static inline void check_loc(ident_t *&loc) { + if (loc == NULL) + loc = &loc_stub; // may need to report location info to ittnotify +} + template static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, kmp_int32 schedtype, kmp_int32 *plastiter, @@ -85,6 +91,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, kmp_uint32 nth; UT trip_count; kmp_team_t *team; + __kmp_assert_valid_gtid(gtid); kmp_info_t *th = __kmp_threads[gtid]; #if OMPT_SUPPORT && OMPT_OPTIONAL @@ -381,6 +388,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, __kmp_forkjoin_frames_mode == 3 && th->th.th_teams_microtask == NULL && team->t.t_active_level == 1) { kmp_uint64 cur_chunk = chunk; + check_loc(loc); // Calculate chunk in case it was not specified; it is specified for // kmp_sch_static_chunked if (schedtype == kmp_sch_static) { @@ -438,6 +446,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid, KMP_DEBUG_ASSERT(plastiter && plower && pupper && pupperDist && pstride); KE_TRACE(10, ("__kmpc_dist_for_static_init called (%d)\n", gtid)); + __kmp_assert_valid_gtid(gtid); #ifdef KMP_DEBUG { char *buff; @@ -667,7 +676,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid, // stride for next chunks calculation. // Last iteration flag set for the team that will execute // the last iteration of the loop. - // The routine is called for dist_schedue(static,chunk) only. + // The routine is called for dist_schedule(static,chunk) only. 
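/* Illustrative sketch (standalone; strncpy_truncate is a renamed stand-in):
 * the kmp_safe_c_api.h hunk above folds the two branches of
 * __kmp_strncpy_truncate into one -- clamp the source length to buf_size - 1,
 * copy, and terminate at the clamped length, which is right whether or not
 * truncation happened.  The equivalent in plain C++: */
#include <cstring>

static void strncpy_truncate(char *buffer, size_t buf_size, const char *src,
                             size_t src_size) {
  if (buf_size == 0)
    return;                        // extra guard, not in the original
  if (src_size >= buf_size)
    src_size = buf_size - 1;       // leave room for the terminator
  strncpy(buffer, src, src_size);  // copies at most src_size characters
  buffer[src_size] = '\0';         // always terminate, truncated or not
}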
typedef typename traits_t::unsigned_t UT; typedef typename traits_t::signed_t ST; kmp_uint32 team_id; @@ -681,6 +690,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid, KMP_DEBUG_ASSERT(p_last && p_lb && p_ub && p_st); KE_TRACE(10, ("__kmp_team_static_init called (%d)\n", gtid)); + __kmp_assert_valid_gtid(gtid); #ifdef KMP_DEBUG { char *buff; diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp index 692ca26d0..efabac858 100644 --- a/runtime/src/kmp_settings.cpp +++ b/runtime/src/kmp_settings.cpp @@ -354,17 +354,20 @@ static void __kmp_stg_parse_size(char const *name, char const *value, } } // __kmp_stg_parse_size + +#if KMP_AFFINITY_SUPPORTED || OMPT_SUPPORT static void __kmp_stg_parse_str(char const *name, char const *value, char **out) { __kmp_str_free(out); *out = __kmp_str_format("%s", value); } // __kmp_stg_parse_str +#endif static void __kmp_stg_parse_int( char const *name, // I: Name of environment variable (used in warning messages). char const *value, // I: Value of environment variable to parse. - int min, // I: Miminal allowed value. + int min, // I: Minimum allowed value. int max, // I: Maximum allowed value. int *out // O: Output (parsed) value. ) { @@ -397,7 +400,7 @@ static void __kmp_stg_parse_int( KMP_INFORM(Using_uint64_Value, name, buf.str); __kmp_str_buf_free(&buf); } - *out = uint; + __kmp_type_convert(uint, out); } // __kmp_stg_parse_int #if KMP_DEBUG_ADAPTIVE_LOCKS @@ -498,10 +501,10 @@ int __kmp_initial_threads_capacity(int req_nproc) { /* MIN( MAX( 32, 4 * $OMP_NUM_THREADS, 4 * omp_get_num_procs() ), * __kmp_max_nth) */ - if (nth < (4 * req_nproc)) - nth = (4 * req_nproc); - if (nth < (4 * __kmp_xproc)) - nth = (4 * __kmp_xproc); + if (nth < (512 * req_nproc)) + nth = (512 * req_nproc); + if (nth < (512 * __kmp_xproc)) + nth = (512 * __kmp_xproc); if (nth > __kmp_max_nth) nth = __kmp_max_nth; @@ -549,7 +552,6 @@ static void __kmp_stg_print_int(kmp_str_buf_t *buffer, char const *name, } } // __kmp_stg_print_int -#if USE_ITT_BUILD && USE_ITT_NOTIFY static void __kmp_stg_print_uint64(kmp_str_buf_t *buffer, char const *name, kmp_uint64 value) { if (__kmp_env_format) { @@ -558,7 +560,6 @@ static void __kmp_stg_print_uint64(kmp_str_buf_t *buffer, char const *name, __kmp_str_buf_print(buffer, " %s=%" KMP_UINT64_SPEC "\n", name, value); } } // __kmp_stg_print_uint64 -#endif static void __kmp_stg_print_str(kmp_str_buf_t *buffer, char const *name, char const *value) { @@ -1034,7 +1035,7 @@ static void __kmp_parse_nested_num_threads(const char *var, const char *env, } // The next character is ',' if (*next == ',') { - // ',' is the fisrt character + // ',' is the first character if (total == 0 || prev_comma) { total++; } @@ -1225,7 +1226,7 @@ static void __kmp_stg_parse_max_active_levels(char const *name, msg = KMP_I18N_STR(ValueTooLarge); KMP_WARNING(ParseSizeIntWarn, name, value, msg); } else { // valid setting - __kmp_dflt_max_active_levels = tmp_dflt; + __kmp_type_convert(tmp_dflt, &(__kmp_dflt_max_active_levels)); __kmp_dflt_max_active_levels_set = true; } } @@ -1305,7 +1306,7 @@ static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer, } // __kmp_stg_print_max_task_priority // KMP_TASKLOOP_MIN_TASKS -// taskloop threashold to switch from recursive to linear tasks creation +// taskloop threshold to switch from recursive to linear tasks creation static void __kmp_stg_parse_taskloop_min_tasks(char const *name, char const *value, void *data) { int tmp; @@ -1315,7 +1316,7 @@ static void __kmp_stg_parse_taskloop_min_tasks(char 
const *name, static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer, char const *name, void *data) { - __kmp_stg_print_int(buffer, name, __kmp_taskloop_min_tasks); + __kmp_stg_print_uint64(buffer, name, __kmp_taskloop_min_tasks); } // __kmp_stg_print_taskloop_min_tasks // ----------------------------------------------------------------------------- @@ -1986,7 +1987,7 @@ static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env, *nextEnv = next; { - int len = next - env; + ptrdiff_t len = next - env; char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char)); KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char)); retlist[len] = '\0'; @@ -2041,7 +2042,7 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, // If we see a parse error, emit a warning and scan to the next ",". // // FIXME - there's got to be a better way to print an error -// message, hopefully without overwritting peices of buf. +// message, hopefully without overwriting peices of buf. #define EMIT_WARN(skip, errlist) \ { \ char ch; \ @@ -2769,7 +2770,7 @@ static int __kmp_parse_place_list(const char *var, const char *env, } { - int len = scan - env; + ptrdiff_t len = scan - env; char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char)); KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char)); retlist[len] = '\0'; @@ -3166,6 +3167,13 @@ static void __kmp_stg_parse_proc_bind(char const *name, char const *value, buf = next; SKIP_WS(buf); bind = proc_bind_spread; +#if KMP_USE_ABT + } else if ((num == (int)proc_bind_unset) || + __kmp_match_str("unset", buf, &next)) { + buf = next; + SKIP_WS(buf); + bind = proc_bind_unset; +#endif } else { KMP_WARNING(StgInvalidValue, name, value); __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; @@ -3242,6 +3250,12 @@ static void __kmp_stg_print_proc_bind(kmp_str_buf_t *buffer, char const *name, case proc_bind_default: __kmp_str_buf_print(buffer, "default"); break; + +#if KMP_USE_ABT + case proc_bind_unset: + __kmp_str_buf_print(buffer, "unset"); + break; +#endif } if (i < nelem - 1) { __kmp_str_buf_print(buffer, ","); @@ -3274,6 +3288,7 @@ static void __kmp_stg_print_affinity_format(kmp_str_buf_t *buffer, } __kmp_str_buf_print(buffer, "%s'\n", __kmp_affinity_format); } + // OMP_ALLOCATOR sets default allocator static void __kmp_stg_parse_allocator(char const *name, char const *value, void *data) { @@ -3291,104 +3306,65 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value, */ const char *buf = value; const char *next; - int num; SKIP_WS(buf); - if ((*buf > '0') && (*buf < '9')) { - next = buf; - SKIP_DIGITS(next); - num = __kmp_str_to_int(buf, *next); - KMP_ASSERT(num > 0); - switch (num) { - case 4: + next = buf; + // check HBW first as the only non-default supported + if (__kmp_match_str("omp_high_bw_mem_alloc", buf, &next) || + __kmp_match_str("4", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { if (__kmp_memkind_available) { __kmp_def_allocator = omp_high_bw_mem_alloc; + return; } else { - __kmp_msg(kmp_ms_warning, - KMP_MSG(OmpNoAllocator, "omp_high_bw_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; + KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc"); } - break; - case 1: - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 2: - __kmp_msg(kmp_ms_warning, - KMP_MSG(OmpNoAllocator, "omp_large_cap_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 3: - 
__kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_const_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 5: - __kmp_msg(kmp_ms_warning, - KMP_MSG(OmpNoAllocator, "omp_low_lat_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 6: - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_cgroup_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 7: - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_pteam_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; - case 8: - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_thread_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - break; } - return; - } - next = buf; - if (__kmp_match_str("omp_high_bw_mem_alloc", buf, &next)) { - if (__kmp_memkind_available) { - __kmp_def_allocator = omp_high_bw_mem_alloc; - } else { - __kmp_msg(kmp_ms_warning, - KMP_MSG(OmpNoAllocator, "omp_high_bw_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; + } else if (__kmp_match_str("omp_default_mem_alloc", buf, &next) || + __kmp_match_str("1", buf, &next)) { + // default requested + SKIP_WS(next); + } else if (__kmp_match_str("omp_large_cap_mem_alloc", buf, &next) || + __kmp_match_str("2", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc"); + } + } else if (__kmp_match_str("omp_const_mem_alloc", buf, &next) || + __kmp_match_str("3", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc"); + } + } else if (__kmp_match_str("omp_low_lat_mem_alloc", buf, &next) || + __kmp_match_str("5", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc"); + } + } else if (__kmp_match_str("omp_cgroup_mem_alloc", buf, &next) || + __kmp_match_str("6", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc"); + } + } else if (__kmp_match_str("omp_pteam_mem_alloc", buf, &next) || + __kmp_match_str("7", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc"); + } + } else if (__kmp_match_str("omp_thread_mem_alloc", buf, &next) || + __kmp_match_str("8", buf, &next)) { + SKIP_WS(next); + if (*next == '\0') { + KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc"); } - } else if (__kmp_match_str("omp_default_mem_alloc", buf, &next)) { - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_large_cap_mem_alloc", buf, &next)) { - __kmp_msg(kmp_ms_warning, - KMP_MSG(OmpNoAllocator, "omp_large_cap_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_const_mem_alloc", buf, &next)) { - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_const_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_low_lat_mem_alloc", buf, &next)) { - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_low_lat_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_cgroup_mem_alloc", buf, &next)) { - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_cgroup_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_pteam_mem_alloc", buf, &next)) { - 
__kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_pteam_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; - } else if (__kmp_match_str("omp_thread_mem_alloc", buf, &next)) { - __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_thread_mem_alloc"), - __kmp_msg_null); - __kmp_def_allocator = omp_default_mem_alloc; } - buf = next; - SKIP_WS(buf); - if (*buf != '\0') { - KMP_WARNING(ParseExtraCharsWarn, name, buf); + __kmp_def_allocator = omp_default_mem_alloc; + if (next == buf || *next != '\0') { + // either no match or extra symbols present after the matched token + KMP_WARNING(StgInvalidValue, name, value); } } @@ -4050,6 +4026,14 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, return; } +#if KMP_USE_ABT + if (true) { + // BOLT only supports a queuing lock. + KMP_WARNING(LockTypeNotSupported, name, value); + __kmp_user_lock_kind = lk_queuing; + KMP_STORE_LOCK_SEQ(queuing); + } else +#endif if (__kmp_str_match("tas", 2, value) || __kmp_str_match("test and set", 2, value) || __kmp_str_match("test_and_set", 2, value) || @@ -4102,15 +4086,24 @@ static void __kmp_stg_parse_lock_kind(char const *name, char const *value, } #endif // KMP_USE_ADAPTIVE_LOCKS #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX - else if (__kmp_str_match("rtm", 1, value)) { + else if (__kmp_str_match("rtm_queuing", 1, value)) { if (__kmp_cpuinfo.rtm) { - __kmp_user_lock_kind = lk_rtm; - KMP_STORE_LOCK_SEQ(rtm); + __kmp_user_lock_kind = lk_rtm_queuing; + KMP_STORE_LOCK_SEQ(rtm_queuing); } else { KMP_WARNING(AdaptiveNotSupported, name, value); __kmp_user_lock_kind = lk_queuing; KMP_STORE_LOCK_SEQ(queuing); } + } else if (__kmp_str_match("rtm_spin", 1, value)) { + if (__kmp_cpuinfo.rtm) { + __kmp_user_lock_kind = lk_rtm_spin; + KMP_STORE_LOCK_SEQ(rtm_spin); + } else { + KMP_WARNING(AdaptiveNotSupported, name, value); + __kmp_user_lock_kind = lk_tas; + KMP_STORE_LOCK_SEQ(queuing); + } } else if (__kmp_str_match("hle", 1, value)) { __kmp_user_lock_kind = lk_hle; KMP_STORE_LOCK_SEQ(hle); @@ -4141,8 +4134,12 @@ static void __kmp_stg_print_lock_kind(kmp_str_buf_t *buffer, char const *name, #endif #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX - case lk_rtm: - value = "rtm"; + case lk_rtm_queuing: + value = "rtm_queuing"; + break; + + case lk_rtm_spin: + value = "rtm_spin"; break; case lk_hle: @@ -4205,7 +4202,7 @@ static void __kmp_stg_parse_spin_backoff_params(const char *name, } // The next character is ',' if (*next == ',') { - // ',' is the fisrt character + // ',' is the first character if (total == 0 || prev_comma) { total++; } @@ -4304,7 +4301,7 @@ static void __kmp_stg_parse_adaptive_lock_props(const char *name, } // The next character is ',' if (*next == ',') { - // ',' is the fisrt character + // ',' is the first character if (total == 0 || prev_comma) { total++; } @@ -4395,7 +4392,7 @@ static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer, // ----------------------------------------------------------------------------- // KMP_HW_SUBSET (was KMP_PLACE_THREADS) -// The longest observable sequense of items is +// The longest observable sequence of items is // Socket-Node-Tile-Core-Thread // So, let's limit to 5 levels for now // The input string is usually short enough, let's use 512 limit for now @@ -4425,7 +4422,7 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value, if (len == 0 && *pos == ':') { __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it } else { - input[len] = toupper(*pos); + input[len] = 
(char)(toupper(*pos)); if (input[len] == 'X') input[len] = ','; // unify delimiters of levels if (input[len] == 'O' && strchr(digits, *(pos + 1))) @@ -4621,6 +4618,35 @@ static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer, __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling); } // __kmp_stg_print_task_throttling +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_USER_LEVEL_MWAIT + +static void __kmp_stg_parse_user_level_mwait(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_user_level_mwait); +} // __kmp_stg_parse_user_level_mwait + +static void __kmp_stg_print_user_level_mwait(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_user_level_mwait); +} // __kmp_stg_print_user_level_mwait + +// ----------------------------------------------------------------------------- +// KMP_MWAIT_HINTS + +static void __kmp_stg_parse_mwait_hints(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_mwait_hints); +} // __kmp_stg_parse_mwait_hints + +static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_mwait_hints); +} // __kmp_stg_print_mwait_hints + +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + // ----------------------------------------------------------------------------- // OMP_DISPLAY_ENV @@ -4695,6 +4721,27 @@ static void __kmp_stg_print_omp_tool_libraries(kmp_str_buf_t *buffer, } } // __kmp_stg_print_omp_tool_libraries +static char *__kmp_tool_verbose_init = NULL; + +static void __kmp_stg_parse_omp_tool_verbose_init(char const *name, + char const *value, void *data) { + __kmp_stg_parse_str(name, value, &__kmp_tool_verbose_init); +} // __kmp_stg_parse_omp_tool_libraries + +static void __kmp_stg_print_omp_tool_verbose_init(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_tool_verbose_init) + __kmp_stg_print_str(buffer, name, __kmp_tool_libraries); + else { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +} // __kmp_stg_print_omp_tool_verbose_init + #endif // Table. 
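/* Illustrative sketch (hypothetical and heavily simplified): the new
 * KMP_USER_LEVEL_MWAIT, KMP_MWAIT_HINTS, and OMP_TOOL_VERBOSE_INIT callbacks
 * above follow the usual kmp_settings.cpp recipe -- a parse callback, a print
 * callback, and a row registered in __kmp_stg_table just below binding both
 * to the variable name.  A scaled-down version of that pattern: */
#include <cstdio>
#include <cstdlib>

typedef void (*parse_fn_t)(const char *name, const char *value, void *data);
typedef void (*print_fn_t)(FILE *out, const char *name, void *data);

struct setting_t {
  const char *name;  // environment variable
  parse_fn_t parse;  // consumes the value at startup
  print_fn_t print;  // used by OMP_DISPLAY_ENV-style dumps
  void *data;        // the global the setting controls
};

static int g_example_hints = 0;  // stand-in for __kmp_mwait_hints

static void parse_int_setting(const char *name, const char *value,
                              void *data) {
  (void)name;  // the real parser range-checks and warns on junk input
  *(int *)data = atoi(value);
}

static void print_int_setting(FILE *out, const char *name, void *data) {
  fprintf(out, "   %s=%d\n", name, *(int *)data);
}

static setting_t g_table[] = {
    {"KMP_EXAMPLE_HINTS", parse_int_setting, print_int_setting,
     &g_example_hints},
    {NULL, NULL, NULL, NULL},  // sentinel, like the trailing "" row
};

static void apply_env_settings() {
  for (setting_t *s = g_table; s->name; ++s) {
    const char *v = getenv(s->name);
    if (v)
      s->parse(s->name, v, s->data);
  }
}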
@@ -4937,8 +4984,16 @@ static kmp_setting_t __kmp_stg_table[] = { 0}, {"OMP_TOOL_LIBRARIES", __kmp_stg_parse_omp_tool_libraries, __kmp_stg_print_omp_tool_libraries, NULL, 0, 0}, + {"OMP_TOOL_VERBOSE_INIT", __kmp_stg_parse_omp_tool_verbose_init, + __kmp_stg_print_omp_tool_verbose_init, NULL, 0, 0}, #endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + {"KMP_USER_LEVEL_MWAIT", __kmp_stg_parse_user_level_mwait, + __kmp_stg_print_user_level_mwait, NULL, 0, 0}, + {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, + __kmp_stg_print_mwait_hints, NULL, 0, 0}, +#endif {"", NULL, NULL, NULL, 0, 0}}; // settings static int const __kmp_stg_count = @@ -5720,7 +5775,11 @@ void __kmp_env_print() { } // __kmp_env_print void __kmp_env_print_2() { + __kmp_display_env_impl(__kmp_display_env, __kmp_display_env_verbose); +} // __kmp_env_print_2 + +void __kmp_display_env_impl(int display_env, int display_env_verbose) { kmp_env_blk_t block; kmp_str_buf_t buffer; @@ -5737,9 +5796,9 @@ void __kmp_env_print_2() { for (int i = 0; i < __kmp_stg_count; ++i) { if (__kmp_stg_table[i].print != NULL && - ((__kmp_display_env && + ((display_env && strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) || - __kmp_display_env_verbose)) { + display_env_verbose)) { __kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name, __kmp_stg_table[i].data); } @@ -5754,7 +5813,6 @@ void __kmp_env_print_2() { __kmp_str_buf_free(&buffer); __kmp_printf("\n"); - -} // __kmp_env_print_2 +} // end of file diff --git a/runtime/src/kmp_settings.h b/runtime/src/kmp_settings.h index 3247ffc6a..d61c40694 100644 --- a/runtime/src/kmp_settings.h +++ b/runtime/src/kmp_settings.h @@ -17,6 +17,7 @@ void __kmp_reset_global_vars(void); void __kmp_env_initialize(char const *); void __kmp_env_print(); void __kmp_env_print_2(); +void __kmp_display_env_impl(int display_env, int display_env_verbose); int __kmp_initial_threads_capacity(int req_nproc); void __kmp_init_dflt_team_nth(); diff --git a/runtime/src/kmp_stats.cpp b/runtime/src/kmp_stats.cpp index 71f2dd93b..280c4738c 100644 --- a/runtime/src/kmp_stats.cpp +++ b/runtime/src/kmp_stats.cpp @@ -67,11 +67,13 @@ static uint32_t statsPrinted = 0; // output interface static kmp_stats_output_module *__kmp_stats_global_output = NULL; -double logHistogram::binMax[] = { - 1.e1l, 1.e2l, 1.e3l, 1.e4l, 1.e5l, 1.e6l, 1.e7l, 1.e8l, - 1.e9l, 1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l, - 1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l, - 1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l}; +double logHistogram::binMax[] = {1.e1l, 1.e2l, 1.e3l, 1.e4l, 1.e5l, 1.e6l, + 1.e7l, 1.e8l, 1.e9l, 1.e10l, 1.e11l, 1.e12l, + 1.e13l, 1.e14l, 1.e15l, 1.e16l, 1.e17l, 1.e18l, + 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l, + 1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l, + // Always have infinity be the last value + std::numeric_limits::infinity()}; /* ************* statistic member functions ************* */ @@ -133,7 +135,7 @@ void statistic::scale(double factor) { } std::string statistic::format(char unit, bool total) const { - std::string result = formatSI(sampleCount, 9, ' '); + std::string result = formatSI((double)sampleCount, 9, ' '); if (sampleCount == 0) { result = result + std::string(", ") + formatSI(0.0, 9, unit); @@ -181,13 +183,10 @@ uint32_t logHistogram::findBin(double sample) { // According to a micro-architect this is likely to be faster than a binary // search, since // it will only have one branch mis-predict - for (int b = 0; b < numBins; b++) + for (int b = 0; b < numBins - 1; b++) if (binMax[b] > v) 
return b; - fprintf(stderr, - "Trying to add a sample that is too large into a histogram\n"); - KMP_ASSERT(0); - return -1; + return numBins - 1; } void logHistogram::addSample(double sample) { @@ -224,8 +223,12 @@ std::string logHistogram::format(char unit) const { result << "\n"; } for (int i = minBin(); i <= maxBin(); i++) { - result << "10**" << i << "<=v<10**" << (i + 1) << ", " - << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit); + result << "10**" << i << "<=v<"; + if (i + 1 == numBins - 1) + result << "infinity, "; + else + result << "10**" << (i + 1) << ", "; + result << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit); if (i != maxBin()) result << "\n"; } @@ -270,7 +273,7 @@ void explicitTimer::stop(tsc_tick_count tick, /* ************* partitionedTimers member functions ************* */ partitionedTimers::partitionedTimers() { timer_stack.reserve(8); } -// initialize the paritioned timers to an initial timer +// initialize the partitioned timers to an initial timer void partitionedTimers::init(explicitTimer timer) { KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); timer_stack.push_back(timer); @@ -461,7 +464,7 @@ int kmp_stats_output_module::printPerThreadFlag = 0; int kmp_stats_output_module::printPerThreadEventsFlag = 0; static char const *lastName(char *name) { - int l = strlen(name); + int l = (int)strlen(name); for (int i = l - 1; i >= 0; --i) { if (name[i] == '.') name[i] = '_'; @@ -609,7 +612,7 @@ void kmp_stats_output_module::printTimerStats(FILE *statsOut, totalStats[s].format(tag, true).c_str()); } - // Print historgram of statistics + // Print histogram of statistics if (theStats[0].haveHist()) { fprintf(statsOut, "\nTimer distributions\n"); for (int s = 0; s < TIMER_LAST; s++) { @@ -656,7 +659,7 @@ void kmp_stats_output_module::printCounters(FILE *statsOut, for (int c = 0; c < COUNTER_LAST; c++) { counter const *stat = &theCounters[c]; fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(c)), - formatSI(stat->getValue(), 9, ' ').c_str()); + formatSI((double)stat->getValue(), 9, ' ').c_str()); } } @@ -679,7 +682,7 @@ void kmp_stats_output_module::printEvents(FILE *eventsOut, void kmp_stats_output_module::windupExplicitTimers() { // Wind up any explicit timers. We assume that it's fair at this point to just - // walk all the explcit timers in all threads and say "it's over". + // walk all the explicit timers in all threads and say "it's over". // If the timer wasn't running, this won't record anything anyway. kmp_stats_list::iterator it; for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { @@ -692,8 +695,7 @@ void kmp_stats_output_module::windupExplicitTimers() { void kmp_stats_output_module::printPloticusFile() { int i; int size = __kmp_stats_list->size(); - FILE *plotOut = fopen(plotFileName, "w+"); - + kmp_safe_raii_file_t plotOut(plotFileName, "w+"); fprintf(plotOut, "#proc page\n" " pagesize: 15 10\n" " scale: 1.0\n\n"); @@ -746,7 +748,6 @@ void kmp_stats_output_module::printPloticusFile() { fprintf(plotOut, "#proc legend\n" " format: down\n" " location: max max\n\n"); - fclose(plotOut); return; } @@ -797,14 +798,16 @@ void kmp_stats_output_module::outputStats(const char *heading) { normal timer stats */ statistic allCounters[COUNTER_LAST]; - FILE *statsOut = - !outputFileName.empty() ? 
fopen(outputFileName.c_str(), "a+") : stderr; - if (!statsOut) - statsOut = stderr; + kmp_safe_raii_file_t statsOut; + if (!outputFileName.empty()) { + statsOut.open(outputFileName.c_str(), "a+"); + } else { + statsOut.set_stderr(); + } - FILE *eventsOut; + kmp_safe_raii_file_t eventsOut; if (eventPrintingEnabled()) { - eventsOut = fopen(eventsFileName, "w+"); + eventsOut.open(eventsFileName, "w+"); } printHeaderInfo(statsOut); @@ -849,22 +852,18 @@ void kmp_stats_output_module::outputStats(const char *heading) { for (counter_e c = counter_e(0); c < COUNTER_LAST; c = counter_e(c + 1)) { if (counter::masterOnly(c) && t != 0) continue; - allCounters[c].addSample((*it)->getCounter(c)->getValue()); + allCounters[c].addSample((double)(*it)->getCounter(c)->getValue()); } } if (eventPrintingEnabled()) { printPloticusFile(); - fclose(eventsOut); } fprintf(statsOut, "Aggregate for all threads\n"); printTimerStats(statsOut, &allStats[0], &totalStats[0]); fprintf(statsOut, "\n"); printCounterStats(statsOut, &allCounters[0]); - - if (statsOut != stderr) - fclose(statsOut); } /* ************* exported C functions ************** */ diff --git a/runtime/src/kmp_stats.h b/runtime/src/kmp_stats.h index ee95658fd..7f4a9492b 100644 --- a/runtime/src/kmp_stats.h +++ b/runtime/src/kmp_stats.h @@ -195,7 +195,7 @@ enum stats_state_e { // from a dynamically scheduled loop // OMP_critical -- Time thread spends executing critical section // OMP_critical_wait -- Time thread spends waiting to enter -// a critcal seciton +// a critical section // OMP_single -- Time spent executing a "single" region // OMP_master -- Time spent executing a "master" region // OMP_task_immediate -- Time spent executing non-deferred tasks @@ -258,6 +258,7 @@ enum stats_state_e { macro(KMP_tree_release, 0, arg) \ macro(USER_resume, 0, arg) \ macro(USER_suspend, 0, arg) \ + macro(USER_mwait, 0, arg) \ macro(KMP_allocate_team, 0, arg) \ macro(KMP_setup_icv_copy, 0, arg) \ macro(USER_icv_copy, 0, arg) \ @@ -422,7 +423,7 @@ class statistic { void setOffset(double d) { offset = d; } void reset() { - minVal = std::numeric_limits::max(); + minVal = (std::numeric_limits::max)(); maxVal = -minVal; meanVal = 0.0; m2 = 0.0; @@ -522,7 +523,7 @@ class partitionedTimers { void windup(); }; -// Special wrapper around the partioned timers to aid timing code blocks +// Special wrapper around the partitioned timers to aid timing code blocks // It avoids the need to have an explicit end, leaving the scope suffices. class blockPartitionedTimer { partitionedTimers *part_timers; @@ -885,7 +886,7 @@ extern kmp_stats_output_module __kmp_stats_output; * @ingroup STATS_GATHERING */ #define KMP_COUNT_VALUE(name, value) \ - __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value) + __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value) /*! * \brief Increments specified counter (name). @@ -920,7 +921,7 @@ extern kmp_stats_output_module __kmp_stats_output; #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) /*! - * \brief Initializes the paritioned timers to begin with name. + * \brief Initializes the partitioned timers to begin with name. 
* * @param name timer which you want this thread to begin with * diff --git a/runtime/src/kmp_str.cpp b/runtime/src/kmp_str.cpp index fb748d1a5..6838bffec 100644 --- a/runtime/src/kmp_str.cpp +++ b/runtime/src/kmp_str.cpp @@ -77,7 +77,7 @@ void __kmp_str_buf_clear(kmp_str_buf_t *buffer) { KMP_STR_BUF_INVARIANT(buffer); } // __kmp_str_buf_clear -void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, int size) { +void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, size_t size) { KMP_STR_BUF_INVARIANT(buffer); KMP_DEBUG_ASSERT(size >= 0); @@ -131,14 +131,15 @@ void __kmp_str_buf_free(kmp_str_buf_t *buffer) { KMP_STR_BUF_INVARIANT(buffer); } // __kmp_str_buf_free -void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, int len) { +void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, size_t len) { KMP_STR_BUF_INVARIANT(buffer); KMP_DEBUG_ASSERT(str != NULL); KMP_DEBUG_ASSERT(len >= 0); + __kmp_str_buf_reserve(buffer, buffer->used + len + 1); KMP_MEMCPY(buffer->str + buffer->used, str, len); buffer->str[buffer->used + len] = 0; - buffer->used += len; + __kmp_type_convert(buffer->used + len, &(buffer->used)); KMP_STR_BUF_INVARIANT(buffer); } // __kmp_str_buf_cat @@ -251,7 +252,7 @@ void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path) { char *base = NULL; // Pointer to the beginning of basename. fname->path = __kmp_str_format("%s", path); // Original code used strdup() function to copy a string, but on Windows* OS - // Intel(R) 64 it causes assertioon id debug heap, so I had to replace + // Intel(R) 64 it causes assertion id debug heap, so I had to replace // strdup with __kmp_str_format(). if (KMP_OS_WINDOWS) { __kmp_str_replace(fname->path, '\\', '/'); @@ -260,7 +261,7 @@ void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path) { slash = strrchr(fname->dir, '/'); if (KMP_OS_WINDOWS && slash == NULL) { // On Windows* OS, if slash not found, - char first = TOLOWER(fname->dir[0]); // look for drive. + char first = (char)TOLOWER(fname->dir[0]); // look for drive. if ('a' <= first && first <= 'z' && fname->dir[1] == ':') { slash = &fname->dir[1]; } @@ -295,7 +296,54 @@ int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern) { return dir_match && base_match; } // __kmp_str_fname_match -kmp_str_loc_t __kmp_str_loc_init(char const *psource, int init_fname) { +// Get the numeric fields from source location string. +// For clang these fields are Line/Col of the start of the construct. +// For icc these are LineBegin/LineEnd of the construct. +// Function is fast as it does not duplicate string (which involves memory +// allocation), and parses the string in place. +void __kmp_str_loc_numbers(char const *Psource, int *LineBeg, + int *LineEndOrCol) { + char *Str; + KMP_DEBUG_ASSERT(LineBeg); + KMP_DEBUG_ASSERT(LineEndOrCol); + // Parse Psource string ";file;func;line;line_end_or_column;;" to get + // numbers only, skipping string fields "file" and "func". + + // Find 1-st semicolon. + KMP_DEBUG_ASSERT(Psource); +#ifdef __cplusplus + Str = strchr(CCAST(char *, Psource), ';'); +#else + Str = strchr(Psource, ';'); +#endif + // Check returned pointer to see if the format of Psource is broken. + if (Str) { + // Find 2-nd semicolon. + Str = strchr(Str + 1, ';'); + } + if (Str) { + // Find 3-rd semicolon. + Str = strchr(Str + 1, ';'); + } + if (Str) { + // Read begin line number. + *LineBeg = atoi(Str + 1); + // Find 4-th semicolon. + Str = strchr(Str + 1, ';'); + } else { + // Broken format of input string, cannot read the number. 
+ *LineBeg = 0; + } + if (Str) { + // Read end line or column number. + *LineEndOrCol = atoi(Str + 1); + } else { + // Broken format of input string, cannot read the number. + *LineEndOrCol = 0; + } +} + +kmp_str_loc_t __kmp_str_loc_init(char const *psource, bool init_fname) { kmp_str_loc_t loc; loc._bulk = NULL; diff --git a/runtime/src/kmp_str.h b/runtime/src/kmp_str.h index 09faadb68..ff6179908 100644 --- a/runtime/src/kmp_str.h +++ b/runtime/src/kmp_str.h @@ -46,10 +46,10 @@ typedef struct kmp_str_buf kmp_str_buf_t; } void __kmp_str_buf_clear(kmp_str_buf_t *buffer); -void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, int size); +void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, size_t size); void __kmp_str_buf_detach(kmp_str_buf_t *buffer); void __kmp_str_buf_free(kmp_str_buf_t *buffer); -void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, int len); +void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, size_t len); void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src); int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format, va_list args); @@ -72,16 +72,16 @@ struct kmp_str_fname { typedef struct kmp_str_fname kmp_str_fname_t; void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path); void __kmp_str_fname_free(kmp_str_fname_t *fname); -// Compares file name with specified patern. If pattern is NULL, any fname +// Compares file name with specified pattern. If pattern is NULL, any fname // matched. int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern); /* The compiler provides source locations in string form - ";file;func;line;col;;". It is not convenient for manupulation. This + ";file;func;line;col;;". It is not convenient for manipulation. This structure keeps source location in more convenient form. Usage: - kmp_str_loc_t loc = __kmp_str_loc_init( ident->psource, 0 ); + kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, false); // use loc.file, loc.func, loc.line, loc.col. // loc.fname is available if second argument of __kmp_str_loc_init is true. __kmp_str_loc_free( & loc ); @@ -98,7 +98,8 @@ struct kmp_str_loc { int col; }; // struct kmp_str_loc typedef struct kmp_str_loc kmp_str_loc_t; -kmp_str_loc_t __kmp_str_loc_init(char const *psource, int init_fname); +kmp_str_loc_t __kmp_str_loc_init(char const *psource, bool init_fname); +void __kmp_str_loc_numbers(char const *Psource, int *Line, int *Col); void __kmp_str_loc_free(kmp_str_loc_t *loc); int __kmp_str_eqf(char const *lhs, char const *rhs); diff --git a/runtime/src/kmp_stub.cpp b/runtime/src/kmp_stub.cpp index 6b5041988..58add6b6a 100644 --- a/runtime/src/kmp_stub.cpp +++ b/runtime/src/kmp_stub.cpp @@ -125,7 +125,7 @@ int kmpc_get_affinity_mask_proc(int proc, void **mask) { /* kmp API functions */ void kmp_set_stacksize(omp_int_t arg) { i; - __kmps_set_stacksize(arg); + __kmps_set_stacksize((size_t)arg); } void kmp_set_stacksize_s(size_t arg) { i; @@ -147,7 +147,7 @@ void *kmp_malloc(size_t size) { i; void *res; #if KMP_OS_WINDOWS - // If succesfull returns a pointer to the memory block, otherwise returns + // If successful returns a pointer to the memory block, otherwise returns // NULL. // Sets errno to ENOMEM or EINVAL if memory allocation failed or parameter // validation failed. 
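The __kmp_str_loc_numbers routine added in kmp_str.cpp above parses the compiler-provided location string ";file;func;line;line_end_or_column;;" in place, hopping semicolons and reading only the two numeric fields. A minimal standalone sketch of the same walk follows; the literal string, file name, and function name are made up for illustration and are not taken from real compiler output.

#include <cassert>
#include <cstdlib>
#include <cstring>

// Hypothetical instance of the ";file;func;line;line_end_or_column;;" layout
// described above; the values are illustrative only.
static const char *psource = ";demo.c;main;42;7;;";

int main() {
  int line = 0, col = 0;
  const char *s = strchr(psource, ';');   // 1st ';' (before "file")
  if (s)
    s = strchr(s + 1, ';');               // 2nd ';' (before "func")
  if (s)
    s = strchr(s + 1, ';');               // 3rd ';' (before the line number)
  if (s) {
    line = atoi(s + 1);                   // begin line
    s = strchr(s + 1, ';');               // 4th ';'
  }
  if (s)
    col = atoi(s + 1);                    // end line or column
  assert(line == 42 && col == 7);
  return 0;
}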
@@ -250,12 +250,12 @@ int __kmps_get_nested(void) { static size_t __kmps_stacksize = KMP_DEFAULT_STKSIZE; -void __kmps_set_stacksize(int arg) { +void __kmps_set_stacksize(size_t arg) { i; __kmps_stacksize = arg; } // __kmps_set_stacksize -int __kmps_get_stacksize(void) { +size_t __kmps_get_stacksize(void) { i; return __kmps_stacksize; } // __kmps_get_stacksize @@ -366,6 +366,17 @@ void *omp_alloc(size_t size, const omp_allocator_handle_t allocator) { i; return malloc(size); } +void *omp_calloc(size_t nmemb, size_t size, + const omp_allocator_handle_t allocator) { + i; + return calloc(nmemb, size); +} +void *omp_realloc(void *ptr, size_t size, + const omp_allocator_handle_t allocator, + const omp_allocator_handle_t free_allocator) { + i; + return realloc(ptr, size); +} void omp_free(void *ptr, const omp_allocator_handle_t allocator) { i; free(ptr); diff --git a/runtime/src/kmp_stub.h b/runtime/src/kmp_stub.h index 679c07b16..caaf783fe 100644 --- a/runtime/src/kmp_stub.h +++ b/runtime/src/kmp_stub.h @@ -25,8 +25,8 @@ void __kmps_set_library(int arg); int __kmps_get_library(void); void __kmps_set_nested(int arg); int __kmps_get_nested(void); -void __kmps_set_stacksize(int arg); -int __kmps_get_stacksize(); +void __kmps_set_stacksize(size_t arg); +size_t __kmps_get_stacksize(); #ifndef KMP_SCHED_TYPE_DEFINED #define KMP_SCHED_TYPE_DEFINED diff --git a/runtime/src/kmp_taskdeps.cpp b/runtime/src/kmp_taskdeps.cpp index f8aa51dd9..5c200210d 100644 --- a/runtime/src/kmp_taskdeps.cpp +++ b/runtime/src/kmp_taskdeps.cpp @@ -35,7 +35,7 @@ static std::atomic kmp_node_id_seed = ATOMIC_VAR_INIT(0); static void __kmp_init_node(kmp_depnode_t *node) { node->dn.successors = NULL; - node->dn.task = NULL; // will point to the rigth task + node->dn.task = NULL; // will point to the right task // once dependences have been processed for (int i = 0; i < MAX_MTX_DEPS; ++i) node->dn.mtx_locks[i] = NULL; @@ -57,7 +57,7 @@ enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 }; size_t sizes[] = { 997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029 }; const size_t MAX_GEN = 8; -static inline kmp_int32 __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) { +static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) { // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % // m_num_sets ); return ((addr >> 6) ^ (addr >> 2)) % hsize; @@ -72,7 +72,7 @@ static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, return current_dephash; size_t new_size = sizes[gen]; - kmp_int32 size_to_allocate = + size_t size_to_allocate = new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); #if USE_FAST_MEMORY @@ -85,19 +85,19 @@ static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, h->nelements = current_dephash->nelements; h->buckets = (kmp_dephash_entry **)(h + 1); h->generation = gen; - + h->nconflicts = 0; // insert existing elements in the new table for (size_t i = 0; i < current_dephash->size; i++) { - kmp_dephash_entry_t *next; - for (kmp_dephash_entry_t *entry = current_dephash->buckets[i]; entry; entry = next) { + kmp_dephash_entry_t *next, *entry; + for (entry = current_dephash->buckets[i]; entry; entry = next) { next = entry->next_in_bucket; // Compute the new hash using the new size, and insert the entry in // the new bucket. 
- kmp_int32 new_bucket = __kmp_dephash_hash(entry->addr, h->size); + size_t new_bucket = __kmp_dephash_hash(entry->addr, h->size); + entry->next_in_bucket = h->buckets[new_bucket]; if (entry->next_in_bucket) { h->nconflicts++; } - entry->next_in_bucket = h->buckets[new_bucket]; h->buckets[new_bucket] = entry; } } @@ -123,8 +123,7 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, else h_size = KMP_DEPHASH_OTHER_SIZE; - kmp_int32 size = - h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); + size_t size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); #if USE_FAST_MEMORY h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size); @@ -155,7 +154,7 @@ __kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t **hash, kmp_intptr_t addr) *hash = __kmp_dephash_extend(thread, h); h = *hash; } - kmp_int32 bucket = __kmp_dephash_hash(addr, h->size); + size_t bucket = __kmp_dephash_hash(addr, h->size); kmp_dephash_entry_t *entry; for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket) @@ -205,7 +204,7 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, return new_head; } -static inline void __kmp_track_dependence(kmp_depnode_t *source, +static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, kmp_depnode_t *sink, kmp_task_t *sink_task) { #ifdef KMP_SUPPORT_GRAPH_OUTPUT @@ -224,11 +223,14 @@ static inline void __kmp_track_dependence(kmp_depnode_t *source, */ if (ompt_enabled.ompt_callback_task_dependence) { kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); - kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + ompt_data_t *sink_data; + if (sink_task) + sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data); + else + sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data; ompt_callbacks.ompt_callback(ompt_callback_task_dependence)( - &(task_source->ompt_task_info.task_data), - &(task_sink->ompt_task_info.task_data)); + &(task_source->ompt_task_info.task_data), sink_data); } #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } @@ -246,7 +248,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { - __kmp_track_dependence(dep, node, task); + __kmp_track_dependence(gtid, dep, node, task); dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -272,7 +274,7 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { - __kmp_track_dependence(sink, source, task); + __kmp_track_dependence(gtid, sink, source, task); sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -417,7 +419,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); #endif KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d " - "possibly aliased dependencies, %d non-aliased depedencies : " + "possibly aliased dependencies, %d non-aliased dependencies : " "dep_barrier=%d .\n", gtid, taskdata, ndeps, ndeps_noalias, dep_barrier)); @@ -473,8 +475,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, npredecessors++; // Update predecessors and obtain current value to check if there are still - // any 
outstandig dependences (some tasks may have finished while we processed - // the dependences) + // any outstanding dependences (some tasks may have finished while we + // processed the dependences) npredecessors = node->dn.npredecessors.fetch_add(npredecessors) + npredecessors; @@ -498,7 +500,7 @@ task'' @param noalias_dep_list List of depend items with no aliasing @return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not -suspendend and queued, or TASK_CURRENT_QUEUED if it was suspended and queued +suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued Schedule a non-thread-switchable task with dependences for execution */ @@ -511,13 +513,12 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, new_taskdata)); - + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; #if OMPT_SUPPORT if (ompt_enabled.enabled) { - OMPT_STORE_RETURN_ADDRESS(gtid); if (!current_task->ompt_task_info.frame.enter_frame.ptr) current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); @@ -528,7 +529,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, current_task ? &(current_task->ompt_task_info.frame) : NULL, &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1, - OMPT_LOAD_RETURN_ADDRESS(gtid)); + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); } new_taskdata->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); @@ -540,47 +541,40 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, ompt_enabled.ompt_callback_dependences) { kmp_int32 i; - new_taskdata->ompt_task_info.ndeps = ndeps + ndeps_noalias; - new_taskdata->ompt_task_info.deps = - (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( - thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); - KMP_ASSERT(new_taskdata->ompt_task_info.deps != NULL); + KMP_ASSERT(ompt_deps != NULL); for (i = 0; i < ndeps; i++) { - new_taskdata->ompt_task_info.deps[i].variable.ptr = - (void *)dep_list[i].base_addr; + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; if (dep_list[i].flags.in && dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[i].dependence_type = ompt_dependence_type_inout; else if (dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_out; + ompt_deps[i].dependence_type = ompt_dependence_type_out; else if (dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[i].dependence_type = - ompt_dependence_type_in; + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset; } for (i = 0; i < ndeps_noalias; i++) { - new_taskdata->ompt_task_info.deps[ndeps + i].variable.ptr = - (void *)noalias_dep_list[i].base_addr; + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_inout; + ompt_deps[ndeps 
+ i].dependence_type = ompt_dependence_type_inout; else if (noalias_dep_list[i].flags.out) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_out; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; else if (noalias_dep_list[i].flags.in) - new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = - ompt_dependence_type_in; + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; } ompt_callbacks.ompt_callback(ompt_callback_dependences)( - &(new_taskdata->ompt_task_info.task_data), - new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps); + &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps); /* We can now free the allocated memory for the dependencies */ - /* For OMPD we might want to delay the free until task_end */ - KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps); - new_taskdata->ompt_task_info.deps = NULL; - new_taskdata->ompt_task_info.ndeps = 0; + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); } #endif /* OMPT_OPTIONAL */ #endif /* OMPT_SUPPORT */ @@ -642,6 +636,23 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, return ret; } +#if OMPT_SUPPORT +void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task, + ompt_data_t *taskwait_task_data) { + if (ompt_enabled.ompt_callback_task_schedule) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + ompt_task_switch, taskwait_task_data); + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + taskwait_task_data, ompt_task_complete, + current_task ? &(current_task->ompt_task_info.task_data) : &task_data); + } + current_task->ompt_task_info.frame.enter_frame.ptr = NULL; + *taskwait_task_data = ompt_data_none; +} +#endif /* OMPT_SUPPORT */ + /*! @ingroup TASKING @param loc_ref location of the original task directive @@ -664,10 +675,78 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, gtid, loc_ref)); return; } - + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; +#if OMPT_SUPPORT + // this function represents a taskwait construct with depend clause + // We signal 4 events: + // - creation of the taskwait task + // - dependences of the taskwait task + // - schedule and finish of the taskwait task + ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data; + KMP_ASSERT(taskwait_task_data->ptr == NULL); + if (ompt_enabled.enabled) { + if (!current_task->ompt_task_info.frame.enter_frame.ptr) + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_data_t task_data = ompt_data_none; + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + current_task ? &(current_task->ompt_task_info.task_data) : &task_data, + current_task ? 
&(current_task->ompt_task_info.frame) : NULL, + taskwait_task_data, + ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1, + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); + } + } + +#if OMPT_OPTIONAL + /* OMPT grab all dependences if requested by the tool */ + if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) { + kmp_int32 i; + + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + + KMP_ASSERT(ompt_deps != NULL); + + for (i = 0; i < ndeps; i++) { + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; + if (dep_list[i].flags.in && dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_inout; + else if (dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_out; + else if (dep_list[i].flags.in) + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + for (i = 0; i < ndeps_noalias; i++) { + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; + if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; + else if (noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; + else if (noalias_dep_list[i].flags.in) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + } + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + taskwait_task_data, ompt_deps, ompt_ndeps); + /* We can now free the allocated memory for the dependencies */ + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); + ompt_deps = NULL; + } +#endif /* OMPT_OPTIONAL */ +#endif /* OMPT_SUPPORT */ + // We can return immediately as: // - dependences are not computed in serial teams (except with proxy tasks) // - if the dephash is not yet created it means we have nothing to wait for @@ -682,11 +761,16 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } kmp_depnode_t node = {0}; __kmp_init_node(&node); + // the stack owns the node + __kmp_node_ref(&node); if (!__kmp_check_deps(gtid, &node, NULL, ¤t_task->td_dephash, DEP_BARRIER, ndeps, dep_list, ndeps_noalias, @@ -694,17 +778,37 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ return; } +#if KMP_USE_ABT + while (1) { + if (!__kmp_check_deps(gtid, &node, NULL, ¤t_task->td_dephash, + DEP_BARRIER, ndeps, dep_list, ndeps_noalias, + noalias_dep_list)) { + KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : " + "loc=%p\n", gtid, loc_ref)); + return; + } + KMP_YIELD(1); + } +#else // KMP_USE_ABT int thread_finished = FALSE; - kmp_flag_32 flag((std::atomic *)&node.dn.npredecessors, 0U); + kmp_flag_32 flag( + (std::atomic 
*)&node.dn.npredecessors, 0U); while (node.dn.npredecessors > 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(NULL), __kmp_task_stealing_constraint); } +#if OMPT_SUPPORT + __ompt_taskwait_dep_finish(current_task, taskwait_task_data); +#endif /* OMPT_SUPPORT */ KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref)); +#endif // !KMP_USE_ABT } diff --git a/runtime/src/kmp_taskdeps.h b/runtime/src/kmp_taskdeps.h index 2a712b348..4e5f8851f 100644 --- a/runtime/src/kmp_taskdeps.h +++ b/runtime/src/kmp_taskdeps.h @@ -89,6 +89,16 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_depnode_t *node = task->td_depnode; + // Check mutexinoutset dependencies, release locks + if (UNLIKELY(node && (node->dn.mtx_num_locks < 0))) { + // negative num_locks means all locks were acquired + node->dn.mtx_num_locks = -node->dn.mtx_num_locks; + for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); + __kmp_release_lock(node->dn.mtx_locks[i], gtid); + } + } + if (task->td_dephash) { KA_TRACE( 40, ("__kmp_release_deps: T#%d freeing dependencies hash of task %p.\n", diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp index d037299f1..742e1cd11 100644 --- a/runtime/src/kmp_tasking.cpp +++ b/runtime/src/kmp_tasking.cpp @@ -247,6 +247,7 @@ static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, } #endif /* BUILD_TIED_TASK_STACK */ +#if !KMP_USE_ABT // returns 1 if new task is allowed to execute, 0 otherwise // checks Task Scheduling constraint (if requested) and // mutexinoutset dependencies if any @@ -275,7 +276,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, } // Check mutexinoutset dependencies, acquire locks kmp_depnode_t *node = tasknew->td_depnode; - if (node && (node->dn.mtx_num_locks > 0)) { + if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { for (int i = 0; i < node->dn.mtx_num_locks; ++i) { KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) @@ -290,6 +291,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, } return true; } +#endif // !KMP_USE_ABT // __kmp_realloc_task_deque: // Re-allocates a task deque for a particular thread, copies the content from @@ -298,6 +300,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, static void __kmp_realloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data) { kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size); kmp_int32 new_size = 2 * size; KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " @@ -325,13 +328,15 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); kmp_task_team_t *task_team = thread->th.th_task_team; +#if !KMP_USE_ABT kmp_int32 tid = __kmp_tid_from_gtid(gtid); kmp_thread_data_t *thread_data; +#endif KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); - if (taskdata->td_flags.tiedness == TASK_UNTIED) { + if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { // untied task needs to increment counter so that the task structure is not // freed prematurely kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); @@ -343,7 +348,7 @@ 
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { } // The first check avoids building task_team thread data if serialized - if (taskdata->td_flags.task_serial) { + if (UNLIKELY(taskdata->td_flags.task_serial)) { KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " "TASK_NOT_PUSHED for task %p\n", gtid, taskdata)); @@ -353,17 +358,37 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { // Now that serialized tasks have returned, we can assume that we are not in // immediate exec mode KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); - if (!KMP_TASKING_ENABLED(task_team)) { + if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) { __kmp_enable_tasking(task_team, thread); } KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); +#if KMP_USE_ABT + + if (taskdata->td_flags.tiedness == TASK_UNTIED) { + if (taskdata->td_flags.executing == 1) { + // Since Argobots can really yield an untied task, we do not need to + // finish and recreate a thread to handle it. + return TASK_SUCCESSFULLY_PUSHED; + } + } + // Because the ABT_tasks are going to be pushed to our internal pools, + // all those mechanisms should be avoided and directly push the task. + if (!__kmp_abt_create_task(thread, task)) { + return TASK_NOT_PUSHED; + } + KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " + "task=%p\n", gtid, task)); + return TASK_SUCCESSFULLY_PUSHED; + +#else // KMP_USE_ABT + // Find tasking deque specific to encountering thread thread_data = &task_team->tt.tt_threads_data[tid]; // No lock needed since only owner can allocate - if (thread_data->td.td_deque == NULL) { + if (UNLIKELY(thread_data->td.td_deque == NULL)) { __kmp_alloc_task_deque(thread, thread_data); } @@ -381,8 +406,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { } else { __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); locked = 1; - // expand deque to push the task which is not allowed to execute - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } } // Lock the deque for the task push operation @@ -416,7 +444,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); TCW_4(thread_data->td.td_deque_ntasks, TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count - + KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self + KMP_FSYNC_RELEASING(taskdata); // releasing child KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " "task=%p ntasks=%d head=%u tail=%u\n", gtid, taskdata, thread_data->td.td_deque_ntasks, @@ -425,6 +454,7 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); return TASK_SUCCESSFULLY_PUSHED; +#endif // !KMP_USE_ABT } // __kmp_pop_current_task_from_thread: set up current task from called thread @@ -547,8 +577,6 @@ static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { task->ompt_task_info.frame.enter_frame = ompt_data_none; task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer; - 
task->ompt_task_info.ndeps = 0; - task->ompt_task_info.deps = NULL; } // __ompt_task_start: @@ -573,24 +601,20 @@ static inline void __ompt_task_start(kmp_task_t *task, // __ompt_task_finish: // Build and trigger final task-schedule event -static inline void -__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task, - ompt_task_status_t status = ompt_task_complete) { - kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); - if (__kmp_omp_cancellation && taskdata->td_taskgroup && - taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { - status = ompt_task_cancel; - } - - /* let OMPT know that we're returning to the callee task */ +static inline void __ompt_task_finish(kmp_task_t *task, + kmp_taskdata_t *resumed_task, + ompt_task_status_t status) { if (ompt_enabled.ompt_callback_task_schedule) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + if (__kmp_omp_cancellation && taskdata->td_taskgroup && + taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { + status = ompt_task_cancel; + } + + /* let OMPT know that we're returning to the callee task */ ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( &(taskdata->ompt_task_info.task_data), status, - &((resumed_task ? resumed_task - : (taskdata->ompt_task_info.scheduling_parent - ? taskdata->ompt_task_info.scheduling_parent - : taskdata->td_parent)) - ->ompt_task_info.task_data)); + (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL)); } } #endif @@ -799,6 +823,10 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, // gtid: global thread ID for calling thread // task: task to be finished // resumed_task: task to be resumed. (may be NULL if task is serialized) +// +// template: effectively ompt_enabled.enabled!=0 +// the version with ompt=false is inlined, allowing to optimize away all ompt +// code in this case template static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task) { @@ -821,7 +849,7 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, } #endif /* BUILD_TIED_TASK_STACK */ - if (taskdata->td_flags.tiedness == TASK_UNTIED) { + if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { // untied task needs to check the counter so that the task structure is not // freed prematurely kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1; @@ -845,23 +873,38 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, return; } } -#if OMPT_SUPPORT - if (ompt) - __ompt_task_finish(task, resumed_task); -#endif - // Check mutexinoutset dependencies, release locks - kmp_depnode_t *node = taskdata->td_depnode; - if (node && (node->dn.mtx_num_locks < 0)) { - // negative num_locks means all locks were acquired - node->dn.mtx_num_locks = -node->dn.mtx_num_locks; - for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) { - KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); - __kmp_release_lock(node->dn.mtx_locks[i], gtid); + // bookkeeping for resuming task: + // GEH - note tasking_ser => task_serial + KMP_DEBUG_ASSERT( + (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == + taskdata->td_flags.task_serial); + if (taskdata->td_flags.task_serial) { + if (resumed_task == NULL) { + resumed_task = taskdata->td_parent; // In a serialized task, the resumed + // task is the parent } + } else { + KMP_DEBUG_ASSERT(resumed_task != + NULL); // verify that resumed task is passed as argument + } + + /* If the tasks' destructor thunk flag has been set, we need to invoke the + destructor thunk that has been generated 
by the compiler. The code is + placed here, since at this point other tasks might have been released + hence overlapping the destructor invocations with some other work in the + released tasks. The OpenMP spec is not specific on when the destructors + are invoked, so we should be free to choose. */ + if (taskdata->td_flags.destructors_thunk) { + kmp_routine_entry_t destr_thunk = task->data1.destructors; + KMP_ASSERT(destr_thunk); + destr_thunk(gtid, task); } KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + bool detach = false; if (taskdata->td_flags.detachable == TASK_DETACHABLE) { if (taskdata->td_allow_completion_event.type == @@ -870,21 +913,41 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); if (taskdata->td_allow_completion_event.type == KMP_EVENT_ALLOW_COMPLETION) { + // task finished execution + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task + +#if OMPT_SUPPORT + // For a detached task, which is not completed, we switch back + // the omp_fulfill_event signals completion + // locking is necessary to avoid a race with ompt_task_late_fulfill + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_detach); +#endif + + // no access to taskdata after this point! + // __kmp_fulfill_event might free taskdata at any time from now + taskdata->td_flags.proxy = TASK_PROXY; // proxify! detach = true; } __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); } } - KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); - KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); if (!detach) { taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPT_SUPPORT + // This is not a detached task, we are done here + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_complete); +#endif + // Only need to keep track of count if team parallel and tasking not - // serialized - if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + // serialized, or task is detachable and event has already been fulfilled + if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || + taskdata->td_flags.detachable == TASK_DETACHABLE) { // Predecrement simulated by "- 1" calculation children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; @@ -897,45 +960,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, // with the proxy task as origin __kmp_release_deps(gtid, taskdata); } + // td_flags.executing must be marked as 0 after __kmp_release_deps has been + // called. Othertwise, if a task is executed immediately from the + // release_deps code, the flag will be reset to 1 again by this same + // function + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task } - // td_flags.executing must be marked as 0 after __kmp_release_deps has been - // called. 
Othertwise, if a task is executed immediately from the release_deps - // code, the flag will be reset to 1 again by this same function - KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); - taskdata->td_flags.executing = 0; // suspend the finishing task KA_TRACE( 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", gtid, taskdata, children)); - /* If the tasks' destructor thunk flag has been set, we need to invoke the - destructor thunk that has been generated by the compiler. The code is - placed here, since at this point other tasks might have been released - hence overlapping the destructor invokations with some other work in the - released tasks. The OpenMP spec is not specific on when the destructors - are invoked, so we should be free to choose. */ - if (taskdata->td_flags.destructors_thunk) { - kmp_routine_entry_t destr_thunk = task->data1.destructors; - KMP_ASSERT(destr_thunk); - destr_thunk(gtid, task); - } - - // bookkeeping for resuming task: - // GEH - note tasking_ser => task_serial - KMP_DEBUG_ASSERT( - (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == - taskdata->td_flags.task_serial); - if (taskdata->td_flags.task_serial) { - if (resumed_task == NULL) { - resumed_task = taskdata->td_parent; // In a serialized task, the resumed - // task is the parent - } - } else { - KMP_DEBUG_ASSERT(resumed_task != - NULL); // verify that resumed task is passed as arguemnt - } - // Free this task and then ancestor tasks if they have no children. // Restore th_current_task first as suggested by John: // johnmc: if an asynchronous inquiry peers into the runtime system @@ -961,6 +998,7 @@ static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, kmp_task_t *task) { KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); + __kmp_assert_valid_gtid(gtid); // this routine will provide task to resume __kmp_task_finish(gtid, task, NULL); @@ -1075,8 +1113,15 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, task->td_dephash = NULL; __kmp_push_current_task_to_thread(this_thr, team, tid); } else { +#if KMP_USE_ABT + // [AC] We don't need to check it because we know no tasks are left now + task->td_incomplete_child_tasks.store(0, std::memory_order_relaxed); + // Not used because do not need to deallocate implicit task + task->td_allocated_child_tasks.store(0, std::memory_order_relaxed); +#else KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); +#endif } #if OMPT_SUPPORT @@ -1162,7 +1207,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_taskdata_t *parent_task = thread->th.th_current_task; size_t shareds_offset; - if (!TCR_4(__kmp_init_middle)) + if (UNLIKELY(!TCR_4(__kmp_init_middle))) __kmp_middle_initialize(); KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " @@ -1290,6 +1335,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_flags.detachable = flags->detachable; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; +#if KMP_USE_ABT + taskdata->td_task_queue = NULL; + taskdata->td_tq_cur_size = 0; + taskdata->td_tq_max_size = 0; +#endif taskdata->td_flags.tasktype = TASK_EXPLICIT; // GEH - TODO: fix this to copy parent task's value of tasking_ser flag @@ -1304,7 +1354,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_flags.task_serial = 
(parent_task->td_flags.final || taskdata->td_flags.team_serial || - taskdata->td_flags.tasking_ser); + taskdata->td_flags.tasking_ser || flags->merged_if0); taskdata->td_flags.started = 0; taskdata->td_flags.executing = 0; @@ -1358,10 +1408,9 @@ kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_routine_entry_t task_entry) { kmp_task_t *retval; kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; - + __kmp_assert_valid_gtid(gtid); input_flags->native = FALSE; // __kmp_task_alloc() sets up all other runtime flags - KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) " "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", @@ -1411,7 +1460,7 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, // // gtid: global thread ID of caller // task: the task to invoke -// current_task: the task to resume after task invokation +// current_task: the task to resume after task invocation static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); @@ -1421,8 +1470,8 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", gtid, taskdata, current_task)); KMP_DEBUG_ASSERT(task); - if (taskdata->td_flags.proxy == TASK_PROXY && - taskdata->td_flags.complete == 1) { + if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY && + taskdata->td_flags.complete == 1)) { // This is a proxy task that was already completed but it needs to run // its bottom-half finish KA_TRACE( @@ -1464,7 +1513,7 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, // TODO: cancel tasks if the parallel region has also been cancelled // TODO: check if this sequence can be hoisted above __kmp_task_start // if cancellation has been enabled for this run ... 
- if (__kmp_omp_cancellation) { + if (UNLIKELY(__kmp_omp_cancellation)) { thread = __kmp_threads[gtid]; kmp_team_t *this_team = thread->th.th_team; kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; @@ -1538,6 +1587,7 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, else kmp_itt_count_task = 0; // thread is not on a barrier - skip timing } + KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) #endif #ifdef KMP_GOMP_COMPAT @@ -1555,11 +1605,12 @@ static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, // Barrier imbalance - adjust arrive time with the task duration thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); } + KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed) + KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent #endif } - // Proxy tasks are not handled by the runtime if (taskdata->td_flags.proxy != TASK_PROXY) { ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); @@ -1691,6 +1742,7 @@ kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, #endif KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, new_taskdata)); + __kmp_assert_valid_gtid(gtid); #if OMPT_SUPPORT kmp_taskdata_t *parent = NULL; @@ -1793,12 +1845,29 @@ template static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, void *frame_address, void *return_address) { +#if !KMP_USE_ABT kmp_taskdata_t *taskdata; +#endif kmp_info_t *thread; +#if !KMP_USE_ABT int thread_finished = FALSE; +#endif KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); + __kmp_assert_valid_gtid(gtid); + +#if KMP_USE_ABT + + thread = __kmp_threads[gtid]; + __kmp_abt_wait_child_tasks(thread, true, TRUE); + + KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d finished waiting, " + "returning TASK_CURRENT_NOT_QUEUED\n", gtid)); + + return TASK_CURRENT_NOT_QUEUED; + +#else // KMP_USE_ABT if (__kmp_tasking_mode != tskm_immediate_exec) { thread = __kmp_threads[gtid]; @@ -1839,7 +1908,7 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, #if USE_ITT_BUILD void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_starting(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ @@ -1849,9 +1918,10 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks); if (must_wait) { - kmp_flag_32 flag(RCAST(std::atomic *, - &(taskdata->td_incomplete_child_tasks)), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &(taskdata->td_incomplete_child_tasks)), + 0U); while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -1859,8 +1929,9 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, } } #if USE_ITT_BUILD - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_finished(gtid, itt_sync_obj); + KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children #endif /* USE_ITT_BUILD */ // Debugger: The taskwait is completed. 
Location remains, but thread is @@ -1891,6 +1962,7 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, gtid, taskdata)); return TASK_CURRENT_NOT_QUEUED; +#endif // !KMP_USE_ABT } #if OMPT_SUPPORT && OMPT_OPTIONAL @@ -1920,13 +1992,35 @@ kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { kmp_taskdata_t *taskdata; kmp_info_t *thread; +#if !KMP_USE_ABT int thread_finished = FALSE; +#endif KMP_COUNT_BLOCK(OMP_TASKYIELD); KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); +#if KMP_USE_ABT + + thread = __kmp_threads[gtid]; + taskdata = thread->th.th_current_task; + // Let others, e.g., tasks, can use this kmp_info. + // Get the associated team before releasing the ownership of th. + kmp_team_t *team = thread->th.th_team; + __kmp_abt_release_info(thread); + // In a taskyield directive we just do it... yield + __kmp_yield(); + if (taskdata->td_flags.tiedness) { + // Obtain kmp_info to continue the original task. + __kmp_abt_acquire_info_for_task(thread, taskdata, team); + } else { + thread = __kmp_abt_bind_task_to_thread(team, taskdata); + } + +#else // KMP_USE_ABT + KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", gtid, loc_ref, end_part)); + __kmp_assert_valid_gtid(gtid); if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { thread = __kmp_threads[gtid]; @@ -1943,7 +2037,7 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { #if USE_ITT_BUILD void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_starting(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ if (!taskdata->td_flags.team_serial) { @@ -1955,7 +2049,7 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { thread->th.ompt_thread_info.ompt_task_yielded = 1; #endif __kmp_execute_tasks_32( - thread, gtid, NULL, FALSE, + thread, gtid, (kmp_flag_32<> *)NULL, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint); #if OMPT_SUPPORT @@ -1966,7 +2060,7 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { } } #if USE_ITT_BUILD - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_finished(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ @@ -1975,6 +2069,8 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; } +#endif // !KMP_USE_ABT + KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata)); @@ -2067,24 +2163,25 @@ void __kmp_assign_orig(kmp_taskred_data_t &item, } // non-NULL reduce_orig means new interface used } -template void __kmp_call_init(kmp_taskred_data_t &item, int j); +template void __kmp_call_init(kmp_taskred_data_t &item, size_t j); template <> void __kmp_call_init(kmp_taskred_data_t &item, - int offset) { + size_t offset) { ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); } template <> void __kmp_call_init(kmp_taskred_data_t &item, - int offset) { + size_t offset) { ((void (*)(void *, void *))item.reduce_init)( (char *)(item.reduce_priv) + offset, item.reduce_orig); } template void *__kmp_task_reduction_init(int gtid, int num, T *data) { + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskgroup_t *tg = 
thread->th.th_current_task->td_taskgroup; - kmp_int32 nth = thread->th.th_team_nproc; + kmp_uint32 nth = thread->th.th_team_nproc; kmp_taskred_data_t *arr; // check input data just in case @@ -2118,7 +2215,7 @@ void *__kmp_task_reduction_init(int gtid, int num, T *data) { arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; if (arr[i].reduce_init != NULL) { // initialize all thread-specific items - for (int j = 0; j < nth; ++j) { + for (size_t j = 0; j < nth; ++j) { __kmp_call_init(arr[i], j * size); } } @@ -2197,6 +2294,7 @@ void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, Get thread-specific location of data item */ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_int32 nth = thread->th.th_team_nproc; if (nth == 1) @@ -2302,6 +2400,7 @@ static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { template void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, T *data) { + __kmp_assert_valid_gtid(gtid); kmp_info_t *thr = __kmp_threads[gtid]; kmp_int32 nth = thr->th.th_team_nproc; __kmpc_taskgroup(loc, gtid); // form new taskgroup first @@ -2397,6 +2496,7 @@ void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { // __kmpc_taskgroup: Start a new taskgroup void __kmpc_taskgroup(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = thread->th.th_current_task; kmp_taskgroup_t *tg_new = @@ -2429,10 +2529,13 @@ void __kmpc_taskgroup(ident_t *loc, int gtid) { // __kmpc_end_taskgroup: Wait until all tasks generated by the current task // and its descendants are complete void __kmpc_end_taskgroup(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = thread->th.th_current_task; kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; +#if !KMP_USE_ABT int thread_finished = FALSE; +#endif #if OMPT_SUPPORT && OMPT_OPTIONAL kmp_team_t *team; @@ -2463,7 +2566,7 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { // For ITT the taskgroup wait is similar to taskwait until we need to // distinguish them void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_starting(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ @@ -2478,13 +2581,24 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { if (!taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks)) { - kmp_flag_32 flag(RCAST(std::atomic *, &(taskgroup->count)), - 0U); +#if KMP_USE_ABT + __kmp_abt_wait_child_tasks(thread, true, 0); + // Since BOLT manages tasks by task queue owned by every task, + // taskgroup->count is not modified at the end of tasks. + // FIXME: it assumes parent-child relationship between parent tasks and + // descendant tasks, while the dependency should be more relaxed. + // For example, taskwait only needs to wait for children, not the all + // descendants. 
+ taskgroup->count = 0; +#else // KMP_USE_ABT + kmp_flag_32 flag( + RCAST(std::atomic *, &(taskgroup->count)), 0U); while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint); } +#endif // !KMP_USE_ABT } taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting @@ -2497,8 +2611,9 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { #endif #if USE_ITT_BUILD - if (itt_sync_obj != NULL) + if (UNLIKELY(itt_sync_obj != NULL)) __kmp_itt_taskwait_finished(gtid, itt_sync_obj); + KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants #endif /* USE_ITT_BUILD */ } KMP_DEBUG_ASSERT(taskgroup->count == 0); @@ -2568,6 +2683,8 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) { #endif } +#if !KMP_USE_ABT + // __kmp_remove_my_task: remove a task from my own deque static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, kmp_task_team_t *task_team, @@ -2911,7 +3028,7 @@ static inline int __kmp_execute_tasks_template( // met, then return now, so that the barrier gather/release pattern can // proceed. If this thread is in the last spin loop in the barrier, // waiting to be released, we know that the termination condition will not - // be satisified, so don't waste any cycles checking it. + // be satisfied, so don't waste any cycles checking it. if (flag == NULL || (!final_spin && flag->done_check())) { KA_TRACE( 15, @@ -2986,8 +3103,9 @@ static inline int __kmp_execute_tasks_template( } } +template int __kmp_execute_tasks_32( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -2995,8 +3113,9 @@ int __kmp_execute_tasks_32( thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int __kmp_execute_tasks_64( - kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_template( @@ -3013,6 +3132,24 @@ int __kmp_execute_tasks_oncore( thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } +template int +__kmp_execute_tasks_32(kmp_info_t *, kmp_int32, + kmp_flag_32 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); +#endif // !KMP_USE_ABT + // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the // next barrier so they can assist in executing enqueued tasks. // First thread in allocates the task team atomically. 
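The hunk above turns __kmp_execute_tasks_32/64 into function templates over the flag type and adds explicit instantiations so their definitions can stay in kmp_tasking.cpp. A minimal sketch of that pattern is below; the toy_flag class and its Cancellable/Sleepable parameters are illustrative stand-ins, not the runtime's actual kmp_flag_32/64 classes.

#include <atomic>

// Toy stand-in for a templated wait flag: the boolean properties are fixed at
// compile time instead of being tested at run time. Illustrative only.
template <bool Cancellable, bool Sleepable> class toy_flag {
  std::atomic<unsigned> *loc;
  unsigned checker;

public:
  toy_flag(std::atomic<unsigned> *l, unsigned c) : loc(l), checker(c) {}
  bool done_check() const {
    return loc->load(std::memory_order_acquire) == checker;
  }
  static constexpr bool is_cancellable() { return Cancellable; }
  static constexpr bool is_sleepable() { return Sleepable; }
};

// Function template whose definition stays in the .cpp file...
template <bool C, bool S> int toy_execute_tasks(toy_flag<C, S> *flag) {
  return (flag == nullptr || flag->done_check()) ? 1 : 0;
}

// ...made usable from other translation units via explicit instantiation,
// mirroring the "template int __kmp_execute_tasks_*(...)" lines above.
template int toy_execute_tasks<false, false>(toy_flag<false, false> *);
template int toy_execute_tasks<true, false>(toy_flag<true, false> *);

int main() {
  std::atomic<unsigned> pending{0};
  toy_flag<false, false> f(&pending, 0U);
  return toy_execute_tasks(&f) ? 0 : 1; // pending == 0, so the wait is "done"
}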
@@ -3045,6 +3182,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); KMP_DEBUG_ASSERT(threads_data != NULL); +#if !KMP_USE_ABT if (__kmp_tasking_mode == tskm_task_teams && (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) { // Release any threads sleeping at the barrier, so that they can steal @@ -3076,6 +3214,9 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, } } } +#else + (void)i; // Suppress an unused warning +#endif // !KMP_USE_ABT KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n", __kmp_gtid_from_thread(this_thr))); @@ -3090,13 +3231,13 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * master thread may exit the barrier code and free the team data structure, * and return the threads to the thread pool). * - * This does not work with the the tasking code, as the thread is still + * This does not work with the tasking code, as the thread is still * expected to participate in the execution of any tasks that may have been * spawned my a member of the team, and the thread still needs access to all * to each thread in the team, so that it can steal work from it. * * Enter the existence of the kmp_task_team_t struct. It employs a reference - * counting mechanims, and is allocated by the master thread before calling + * counting mechanism, and is allocated by the master thread before calling * __kmp__release, and then is release by the last thread to * exit __kmp__release at the next barrier. I.e. the lifetimes * of the kmp_task_team_t structs for consecutive barriers can overlap @@ -3107,7 +3248,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team, * We currently use the existence of the threads array as an indicator that * tasks were spawned since the last barrier. If the structure is to be * useful outside the context of tasking, then this will have to change, but - * not settting the field minimizes the performance impact of tasking on + * not setting the field minimizes the performance impact of tasking on * barriers, when no explicit tasks were spawned (pushed, actually). */ @@ -3169,7 +3310,7 @@ static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { // __kmp_realloc_task_threads_data: // Allocates a threads_data array for a task team, either by allocating an // initial array or enlarging an existing array. Only the first thread to get -// the lock allocs or enlarges the array and re-initializes the array eleemnts. +// the lock allocs or enlarges the array and re-initializes the array elements. // That thread returns "TRUE", the rest return "FALSE". // Assumes that the new array size is given by task_team -> tt.tt_nproc. // The current size is given by task_team -> tt.tt_max_threads. @@ -3319,15 +3460,25 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " "task team for team %p\n", __kmp_gtid_from_thread(thread), team)); - // Allocate a new task team if one is not available. - // Cannot use __kmp_thread_malloc() because threads not around for - // kmp_reap_task_team( ). + // Allocate a new task team if one is not available. Cannot use + // __kmp_thread_malloc because threads not around for kmp_reap_task_team. 
task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); - // AC: __kmp_allocate zeroes returned memory - // task_team -> tt.tt_threads_data = NULL; - // task_team -> tt.tt_max_threads = 0; - // task_team -> tt.tt_next = NULL; +#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG + // suppress race conditions detection on synchronization flags in debug mode + // this helps to analyze library internals eliminating false positives + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint32 *, &task_team->tt.tt_active), + sizeof(task_team->tt.tt_active)); +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ + // Note: __kmp_allocate zeroes returned memory, othewise we would need: + // task_team->tt.tt_threads_data = NULL; + // task_team->tt.tt_max_threads = 0; + // task_team->tt.tt_next = NULL; } TCW_4(task_team->tt.tt_found_tasks, FALSE); @@ -3426,6 +3577,7 @@ void __kmp_wait_to_unref_task_teams(void) { "unreference task_team\n", __kmp_gtid_from_thread(thread))); +#if !KMP_USE_ABT if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { volatile void *sleep_loc; // If the thread is sleeping, awaken it. @@ -3438,6 +3590,7 @@ void __kmp_wait_to_unref_task_teams(void) { __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); } } +#endif // !KMP_USE_ABT } if (done) { break; @@ -3517,7 +3670,8 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { // Toggle the th_task_state field, to switch which task_team this thread // refers to - this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; + this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); + // It is now safe to propagate the task team pointer from the team struct to // the current thread. TCW_PTR(this_thr->th.th_task_team, @@ -3549,13 +3703,18 @@ void __kmp_task_team_wait( KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " "(for unfinished_threads to reach 0) on task_team = %p\n", __kmp_gtid_from_thread(this_thr), task_team)); +#if KMP_USE_ABT + KMP_DEBUG_ASSERT(wait == 0); +#else // Worker threads may have dropped through to release phase, but could // still be executing tasks. Wait here for tasks to complete. To avoid // memory contention, only master thread checks termination condition. - kmp_flag_32 flag(RCAST(std::atomic *, - &task_team->tt.tt_unfinished_threads), - 0U); + kmp_flag_32 flag( + RCAST(std::atomic *, + &task_team->tt.tt_unfinished_threads), + 0U); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); +#endif } // Deactivate the old task team, so that the worker threads will stop // referencing it while spinning. @@ -3576,11 +3735,12 @@ void __kmp_task_team_wait( } // __kmp_tasking_barrier: -// This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. +// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. // Internal function to execute all tasks prior to a regular barrier or a join // barrier. It is a full barrier itself, which unfortunately turns regular // barriers into double barriers and join barriers into 1 1/2 barriers. 
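Note the toggle of th_task_state in __kmp_task_team_sync above: each team keeps two task-team slots, and flipping the per-thread index at every barrier lets the old slot drain while new tasks go to the new one. A small sketch of that double-buffering idea (struct and field names below are illustrative, not the runtime's layout):

    #include <cstdint>

    struct TaskTeam;                  // opaque per-barrier task bookkeeping

    struct Team {
      TaskTeam *task_team[2];         // one slot per barrier "generation"
    };

    struct ThreadState {
      std::uint8_t task_state;        // 0 or 1: which slot this thread uses
      TaskTeam *current;
    };

    // At each barrier, switch to the other slot; the previous slot can still
    // be drained and freed by late threads while new tasks use the new slot.
    inline void sync_task_team(ThreadState &th, Team &team) {
      th.task_state = static_cast<std::uint8_t>(1 - th.task_state);
      th.current = team.task_team[th.task_state];
    }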
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { +#if !KMP_USE_ABT std::atomic *spin = RCAST( std::atomic *, &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); @@ -3590,7 +3750,7 @@ void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { #if USE_ITT_BUILD KMP_FSYNC_SPIN_INIT(spin, NULL); #endif /* USE_ITT_BUILD */ - kmp_flag_32 spin_flag(spin, 0U); + kmp_flag_32 spin_flag(spin, 0U); while (!spin_flag.execute_tasks(thread, gtid, TRUE, &flag USE_ITT_BUILD_ARG(NULL), 0)) { #if USE_ITT_BUILD @@ -3608,6 +3768,7 @@ void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { #if USE_ITT_BUILD KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); #endif /* USE_ITT_BUILD */ +#endif // !KMP_USE_ABT } // __kmp_give_task puts a task into a given thread queue if: @@ -3651,7 +3812,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, return result; __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); - __kmp_realloc_task_deque(thread, thread_data); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } else { @@ -3757,7 +3922,7 @@ static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { @param gtid Global Thread ID of encountering thread @param ptask Task which execution is completed -Execute the completation of a proxy task from a thread of that is part of the +Execute the completion of a proxy task from a thread of that is part of the team. Run first and bottom halves directly. */ void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { @@ -3766,7 +3931,7 @@ void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { KA_TRACE( 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata)); - + __kmp_assert_valid_gtid(gtid); KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); __kmp_first_top_half_finish_proxy(taskdata); @@ -3782,7 +3947,7 @@ void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { @ingroup TASKING @param ptask Task which execution is completed -Execute the completation of a proxy task from a thread that could not belong to +Execute the completion of a proxy task from a thread that could not belong to the team. */ void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { @@ -3847,22 +4012,30 @@ void __kmp_fulfill_event(kmp_event_t *event) { bool detached = false; int gtid = __kmp_get_gtid(); + // The associated task might have completed or could be completing at this + // point. + // We need to take the lock to avoid races + __kmp_acquire_tas_lock(&event->lock, gtid); if (taskdata->td_flags.proxy == TASK_PROXY) { - // The associated task code completed before this call and detached. detached = true; - event->type = KMP_EVENT_UNINITIALIZED; } else { - // The associated task has not completed but could be completing at this - // point. 
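The reworked __kmp_fulfill_event takes the event lock before inspecting the proxy flag, so the "already completed and detached" check and the event invalidation happen atomically with respect to a concurrently finishing task; the deferred completion then runs outside the lock. A stripped-down sketch of that pattern (type and field names are placeholders, not the runtime's API):

    #include <mutex>

    enum class EventState { Initialized, Uninitialized };

    struct Event {
      std::mutex lock;
      EventState state = EventState::Initialized;
      bool task_detached = false;   // set by the task if it finishes first
    };

    // Returns true if the caller must now run the deferred completion
    // (the task finished and detached before the event was fulfilled).
    inline bool fulfill(Event &ev) {
      bool detached;
      {
        std::lock_guard<std::mutex> g(ev.lock);   // serialize with task completion
        detached = ev.task_detached;              // read the flag under the lock
        ev.state = EventState::Uninitialized;     // invalidate the event either way
      }
      return detached;                            // completion work happens outside the lock
    }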
- // We need to take the lock to avoid races - __kmp_acquire_tas_lock(&event->lock, gtid); - if (taskdata->td_flags.proxy == TASK_PROXY) - detached = true; - event->type = KMP_EVENT_UNINITIALIZED; - __kmp_release_tas_lock(&event->lock, gtid); +#if OMPT_SUPPORT + // The OMPT event must occur under mutual exclusion, + // otherwise the tool might access ptask after free + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); +#endif } + event->type = KMP_EVENT_UNINITIALIZED; + __kmp_release_tas_lock(&event->lock, gtid); if (detached) { +#if OMPT_SUPPORT + // We free ptask afterwards and know the task is finished, + // so locking is not necessary + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); +#endif // If the task detached complete the proxy task if (gtid >= 0) { kmp_team_t *team = taskdata->td_team; @@ -3888,14 +4061,13 @@ void __kmp_fulfill_event(kmp_event_t *event) { kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { kmp_task_t *task; kmp_taskdata_t *taskdata; - kmp_taskdata_t *taskdata_src; - kmp_taskdata_t *parent_task = thread->th.th_current_task; + kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); + kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task size_t shareds_offset; size_t task_size; KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src)); - taskdata_src = KMP_TASK_TO_TASKDATA(task_src); KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == TASK_FULL); // it should not be proxy task KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); @@ -3923,9 +4095,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { } taskdata->td_alloc_thread = thread; taskdata->td_parent = parent_task; - taskdata->td_taskgroup = - parent_task - ->td_taskgroup; // task inherits the taskgroup from the parent task + // task inherits the taskgroup from the parent task + taskdata->td_taskgroup = parent_task->td_taskgroup; + // tied task needs to initialize the td_last_tied at creation, + // untied one does this when it is scheduled for execution + if (taskdata->td_flags.tiedness == TASK_TIED) + taskdata->td_last_tied = taskdata; // Only need to keep track of child task counts if team parallel and tasking // not serialized @@ -4073,6 +4248,7 @@ class kmp_taskloop_bounds_t { // num_tasks Number of tasks to execute // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task // tc Iterations count // task_dup Tasks duplication routine // codeptr_ra Return address for OMPT events @@ -4080,7 +4256,7 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, - kmp_uint64 tc, + kmp_int64 last_chunk, kmp_uint64 tc, #if OMPT_SUPPORT void *codeptr_ra, #endif @@ -4098,13 +4274,14 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_task_t *next_task; kmp_int32 lastpriv = 0; - KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? 
last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " - "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", - gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, - task_dup)); + "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", + gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, + ub_glob, st, task_dup)); // Launch num_tasks tasks, assign grainsize iterations each task for (i = 0; i < num_tasks; ++i) { @@ -4116,6 +4293,9 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, --extras; // first extras iterations get bigger chunk (grainsize+1) } upper = lower + st * chunk_minus_1; + if (upper > *ub) { + upper = *ub; + } if (i == num_tasks - 1) { // schedule the last task, set lastprivate flag if needed if (st == 1) { // most common case @@ -4144,7 +4324,8 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, } else { next_task_bounds.set_ub(upper); } - if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. + if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, + // etc. ptask_dup(next_task, task, lastpriv); KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " @@ -4178,6 +4359,7 @@ typedef struct __taskloop_params { kmp_uint64 num_tasks; kmp_uint64 grainsize; kmp_uint64 extras; + kmp_int64 last_chunk; kmp_uint64 tc; kmp_uint64 num_t_min; #if OMPT_SUPPORT @@ -4187,13 +4369,14 @@ typedef struct __taskloop_params { void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, - kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, + kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, + kmp_uint64, #if OMPT_SUPPORT void *, #endif void *); -// Execute part of the the taskloop submitted as a task. +// Execute part of the taskloop submitted as a task. 
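The assertion tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras) captures the two distribution modes: without the strict modifier the remainder is spread as extras chunks of grainsize + 1, while with strict the requested grainsize is kept and only the final task is shortened by |last_chunk| iterations. For example, tc = 10 with grainsize = 3 yields 3 tasks of sizes 4/3/3 in the default mode and 4 tasks of sizes 3/3/3/1 (last_chunk = -2) in strict mode. A standalone sketch of that split (helper and struct names are assumptions, not the runtime's code):

    #include <cassert>
    #include <cstdint>

    struct Split {
      std::uint64_t num_tasks;
      std::uint64_t grainsize;
      std::uint64_t extras;      // chunks that get one extra iteration
      std::int64_t last_chunk;   // <= 0: reduction applied to the last chunk
    };

    inline Split split_iterations(std::uint64_t tc, std::uint64_t grainsize, bool strict) {
      Split s{0, grainsize, 0, 0};
      if (grainsize >= tc)
        return Split{1, tc, 0, 0};         // one task covers the whole loop
      if (strict) {                        // keep the requested grainsize exactly
        s.num_tasks = (tc + grainsize - 1) / grainsize;
        s.last_chunk = static_cast<std::int64_t>(tc) -
                       static_cast<std::int64_t>(s.num_tasks * grainsize);
      } else {                             // rebalance across equal-ish chunks
        s.num_tasks = tc / grainsize;
        s.grainsize = tc / s.num_tasks;
        s.extras = tc % s.num_tasks;
      }
      std::int64_t covered =
          static_cast<std::int64_t>(s.num_tasks * s.grainsize) +
          (s.last_chunk < 0 ? s.last_chunk : static_cast<std::int64_t>(s.extras));
      assert(covered == static_cast<std::int64_t>(tc));
      (void)covered;                       // silence unused warning with NDEBUG
      return s;
    }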
int __kmp_taskloop_task(int gtid, void *ptask) { __taskloop_params_t *p = (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; @@ -4207,6 +4390,7 @@ int __kmp_taskloop_task(int gtid, void *ptask) { kmp_uint64 num_tasks = p->num_tasks; kmp_uint64 grainsize = p->grainsize; kmp_uint64 extras = p->extras; + kmp_int64 last_chunk = p->last_chunk; kmp_uint64 tc = p->tc; kmp_uint64 num_t_min = p->num_t_min; #if OMPT_SUPPORT @@ -4215,22 +4399,23 @@ int __kmp_taskloop_task(int gtid, void *ptask) { #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); - KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" - " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", - gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, - task_dup)); + KA_TRACE(20, + ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); #endif KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); if (num_tasks > num_t_min) __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, num_t_min, + grainsize, extras, last_chunk, tc, num_t_min, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); else __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT codeptr_ra, #endif @@ -4240,8 +4425,8 @@ int __kmp_taskloop_task(int gtid, void *ptask) { return 0; } -// Schedule part of the the taskloop as a task, -// execute the rest of the the taskloop. +// Schedule part of the taskloop as a task, +// execute the rest of the taskloop. // // loc Source location information // gtid Global thread ID @@ -4253,28 +4438,29 @@ int __kmp_taskloop_task(int gtid, void *ptask) { // num_tasks Number of tasks to execute // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task // tc Iterations count -// num_t_min Threashold to launch tasks recursively +// num_t_min Threshold to launch tasks recursively // task_dup Tasks duplication routine // codeptr_ra Return address for OMPT events void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, - kmp_uint64 tc, kmp_uint64 num_t_min, + kmp_int64 last_chunk, kmp_uint64 tc, + kmp_uint64 num_t_min, #if OMPT_SUPPORT void *codeptr_ra, #endif void *task_dup) { -#if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); KMP_DEBUG_ASSERT(num_tasks > num_t_min); - KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" - " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", - gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, - task_dup)); -#endif + KA_TRACE(20, + ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; kmp_uint64 lower = *lb; kmp_info_t *thread = __kmp_threads[gtid]; @@ -4285,16 +4471,23 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, size_t upper_offset = (char *)ub - (char *)task; // remember offset of ub in the task structure - 
KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); // split the loop in two halves kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; + kmp_int64 last_chunk0 = 0, last_chunk1 = 0; kmp_uint64 gr_size0 = grainsize; kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task - if (n_tsk0 <= extras) { + if (last_chunk < 0) { + ext0 = ext1 = 0; + last_chunk1 = last_chunk; + tc0 = grainsize * n_tsk0; + tc1 = tc - tc0; + } else if (n_tsk0 <= extras) { gr_size0++; // integrate extras into grainsize ext0 = 0; // no extra iters in 1st half ext1 = extras - n_tsk0; // remaining extras @@ -4313,14 +4506,19 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task // adjust lower bound (upper bound is not changed) for the 2nd half *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; - if (ptask_dup != NULL) // construct fistprivates, etc. + if (ptask_dup != NULL) // construct firstprivates, etc. ptask_dup(next_task, task, 0); *ub = ub0; // adjust upper bound for the 1st half // create auxiliary task for 2nd half of the loop + // make sure new task has same parent task as the pattern task + kmp_taskdata_t *current_task = thread->th.th_current_task; + thread->th.th_current_task = taskdata->td_parent; kmp_task_t *new_task = __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), sizeof(__taskloop_params_t), &__kmp_taskloop_task); + // restore current task + thread->th.th_current_task = current_task; __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; p->task = next_task; p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); @@ -4331,6 +4529,7 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, p->num_tasks = n_tsk1; p->grainsize = grainsize; p->extras = ext1; + p->last_chunk = last_chunk1; p->tc = tc1; p->num_t_min = num_t_min; #if OMPT_SUPPORT @@ -4347,44 +4546,28 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, // execute the 1st half of current subrange if (n_tsk0 > num_t_min) __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, - ext0, tc0, num_t_min, + ext0, last_chunk0, tc0, num_t_min, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); else __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, - gr_size0, ext0, tc0, + gr_size0, ext0, last_chunk0, tc0, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); - KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); + KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid)); } -/*! -@ingroup TASKING -@param loc Source location information -@param gtid Global thread ID -@param task Task structure -@param if_val Value of the if clause -@param lb Pointer to loop lower bound in task structure -@param ub Pointer to loop upper bound in task structure -@param st Loop stride -@param nogroup Flag, 1 if no taskgroup needs to be added, 0 otherwise -@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks -@param grainsize Schedule value if specified -@param task_dup Tasks duplication routine - -Execute the taskloop construct. 
-*/ -void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, - kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, - int sched, kmp_uint64 grainsize, void *task_dup) { +static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, + int modifier, void *task_dup) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); - if (nogroup == 0) { #if OMPT_SUPPORT && OMPT_OPTIONAL OMPT_STORE_RETURN_ADDRESS(gtid); @@ -4401,13 +4584,16 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 upper = task_bounds.get_ub(); kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag kmp_uint64 num_tasks = 0, extras = 0; + kmp_int64 last_chunk = + 0; // reduce grainsize of last task by last_chunk in strict mode kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; - KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " - "grain %llu(%d), dup %p\n", - gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " + "grain %llu(%d, %d), dup %p\n", + gtid, taskdata, lower, upper, st, grainsize, sched, modifier, + task_dup)); // compute trip count if (st == 1) { // most common case @@ -4418,7 +4604,7 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, tc = (upper - lower) / st + 1; } if (tc == 0) { - KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid)); // free the pattern task and exit __kmp_task_start(gtid, task, current_task); // do not execute anything for zero-trip loop @@ -4460,20 +4646,28 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, break; case 1: // grainsize provided if (grainsize > tc) { - num_tasks = 1; // too big grainsize requested, adjust values - grainsize = tc; + num_tasks = 1; + grainsize = tc; // too big grainsize requested, adjust values extras = 0; } else { - num_tasks = tc / grainsize; - // adjust grainsize for balanced distribution of iterations - grainsize = tc / num_tasks; - extras = tc % num_tasks; + if (modifier) { + num_tasks = (tc + grainsize - 1) / grainsize; + last_chunk = tc - (num_tasks * grainsize); + extras = 0; + } else { + num_tasks = tc / grainsize; + // adjust grainsize for balanced distribution of iterations + grainsize = tc / num_tasks; + extras = tc % num_tasks; + } } break; default: KMP_ASSERT2(0, "unknown scheduling of taskloop"); } - KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? 
last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); // ========================================================================= @@ -4485,7 +4679,7 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied // always start serial tasks linearly __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif @@ -4493,21 +4687,23 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, // !taskdata->td_flags.native => currently force linear spawning of tasks // for GOMP_taskloop } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { - KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" - "(%lld), grain %llu, extras %llu\n", - gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, num_tasks_min, + grainsize, extras, last_chunk, tc, num_tasks_min, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif task_dup); } else { - KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" - "(%lld), grain %llu, extras %llu\n", - gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif @@ -4528,5 +4724,59 @@ void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, #endif __kmpc_end_taskgroup(loc, gtid); } + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid)); +} + +/*! +@ingroup TASKING +@param loc Source location information +@param gtid Global thread ID +@param task Task structure +@param if_val Value of the if clause +@param lb Pointer to loop lower bound in task structure +@param ub Pointer to loop upper bound in task structure +@param st Loop stride +@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise +@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks +@param grainsize Schedule value if specified +@param task_dup Tasks duplication routine + +Execute the taskloop construct. +*/ +void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, + int sched, kmp_uint64 grainsize, void *task_dup) { + __kmp_assert_valid_gtid(gtid); + KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid)); + __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, + 0, task_dup); KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); } + +/*! 
+@ingroup TASKING +@param loc Source location information +@param gtid Global thread ID +@param task Task structure +@param if_val Value of the if clause +@param lb Pointer to loop lower bound in task structure +@param ub Pointer to loop upper bound in task structure +@param st Loop stride +@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise +@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks +@param grainsize Schedule value if specified +@param modifer Modifier 'strict' for sched, 1 if present, 0 otherwise +@param task_dup Tasks duplication routine + +Execute the taskloop construct. +*/ +void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, + int modifier, void *task_dup) { + __kmp_assert_valid_gtid(gtid); + KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid)); + __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, + modifier, task_dup); + KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid)); +} diff --git a/runtime/src/kmp_threadprivate.cpp b/runtime/src/kmp_threadprivate.cpp index 87bfff39e..270c90abf 100644 --- a/runtime/src/kmp_threadprivate.cpp +++ b/runtime/src/kmp_threadprivate.cpp @@ -113,10 +113,9 @@ static struct private_data *__kmp_init_common_data(void *pc_addr, // Initialize the data area from the template. static void __kmp_copy_common_data(void *pc_addr, struct private_data *d) { char *addr = (char *)pc_addr; - int i, offset; - for (offset = 0; d != 0; d = d->next) { - for (i = d->more; i > 0; --i) { + for (size_t offset = 0; d != 0; d = d->next) { + for (int i = d->more; i > 0; --i) { if (d->data == 0) memset(&addr[offset], '\0', d->size); else @@ -244,9 +243,8 @@ void __kmp_common_destroy_gtid(int gtid) { d_tn = __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, gtid, tn->gbl_addr); - - KMP_DEBUG_ASSERT(d_tn); - + if (d_tn == NULL) + continue; if (d_tn->is_vec) { if (d_tn->dt.dtorv != 0) { (void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len); diff --git a/runtime/src/kmp_utility.cpp b/runtime/src/kmp_utility.cpp index 44a99d045..f163f0551 100644 --- a/runtime/src/kmp_utility.cpp +++ b/runtime/src/kmp_utility.cpp @@ -113,7 +113,7 @@ static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz. } else { // Wrong unit. return result; } - result = value; + result = (kmp_uint64)value; // rounds down } return result; @@ -194,7 +194,7 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) { KA_TRACE(trace_level, (" PSN")); } if ((buf.edx >> 19) & 1) { - /* CLFULSH - Cache Flush Instruction Available */ + /* CLFLUSH - Cache Flush Instruction Available */ cflush_size = data[1] * 8; /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */ KA_TRACE(trace_level, (" CLFLUSH(%db)", cflush_size)); diff --git a/runtime/src/kmp_version.h b/runtime/src/kmp_version.h index 9e726b380..d21a8eed3 100644 --- a/runtime/src/kmp_version.h +++ b/runtime/src/kmp_version.h @@ -30,7 +30,7 @@ extern "C" { just before version string. */ #define KMP_VERSION_MAGIC_STR "\x00@(#) " #define KMP_VERSION_MAGIC_LEN 6 // Length of KMP_VERSION_MAGIC_STR. -#define KMP_VERSION_PREF_STR "Intel(R) OMP " +#define KMP_VERSION_PREF_STR "BOLT based on LLVM OMP " #define KMP_VERSION_PREFIX KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR /* declare all the version string constants for KMP_VERSION env. 
variable */ diff --git a/runtime/src/kmp_wait_release.cpp b/runtime/src/kmp_wait_release.cpp index 7d12c74bf..ecba6d321 100644 --- a/runtime/src/kmp_wait_release.cpp +++ b/runtime/src/kmp_wait_release.cpp @@ -12,14 +12,35 @@ #include "kmp_wait_release.h" -void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, +#if !KMP_USE_ABT + +void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + __kmp_wait_template, TRUE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + __kmp_wait_template, FALSE>( this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); } -void __kmp_release_64(kmp_flag_64 *flag) { __kmp_release_template(flag); } +void __kmp_release_64(kmp_flag_64<> *flag) { __kmp_release_template(flag); } + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +template +void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_mwait_template(th_gtid, flag); +} + +template void __kmp_mwait_32(int, kmp_flag_32 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +#endif +#endif /* !KMP_USE_ABT */ diff --git a/runtime/src/kmp_wait_release.h b/runtime/src/kmp_wait_release.h index b235be3cf..166bf30bc 100644 --- a/runtime/src/kmp_wait_release.h +++ b/runtime/src/kmp_wait_release.h @@ -20,6 +20,8 @@ #include "ompt-specific.h" #endif +#if !KMP_USE_ABT + /*! @defgroup WAIT_RELEASE Wait/Release operations @@ -42,20 +44,26 @@ enum flag_type { flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ }; +struct flag_properties { + unsigned int type : 16; + unsigned int reserved : 16; +}; + /*! * Base class for wait/release volatile flag */ template class kmp_flag_native { volatile P *loc; - flag_type t; + flag_properties t; public: typedef P flag_t; - kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {} + kmp_flag_native(volatile P *p, flag_type ft) + : loc(p), t({(short unsigned int)ft, 0U}) {} volatile P *get() { return loc; } void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } void set(volatile P *new_loc) { loc = new_loc; } - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } P load() { return *loc; } void store(P val) { *loc = val; } }; @@ -67,10 +75,11 @@ template class kmp_flag { std::atomic
<P>
*loc; /**< Pointer to the flag storage that is modified by another thread */ - flag_type t; /**< "Type" of the flag in loc */ + flag_properties t; /**< "Type" of the flag in loc */ public: typedef P flag_t; - kmp_flag(std::atomic
<P>
*p, flag_type ft) : loc(p), t(ft) {} + kmp_flag(std::atomic
<P>
*p, flag_type ft) + : loc(p), t({(short unsigned int)ft, 0U}) {} /*! * @result the pointer to the actual flag */ @@ -86,7 +95,7 @@ template class kmp_flag { /*! * @result the flag_type */ - flag_type get_type() { return t; } + flag_type get_type() { return (flag_type)(t.type); } /*! * @result flag value */ @@ -104,6 +113,7 @@ template class kmp_flag { bool notdone_check(); P internal_release(); void suspend(int th_gtid); + void mwait(int th_gtid); void resume(int th_gtid); P set_sleeping(); P unset_sleeping(); @@ -160,8 +170,8 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr, to wake it back up to prevent deadlocks! NOTE: We may not belong to a team at this point. */ -template +template static inline bool __kmp_wait_template(kmp_info_t *this_thr, C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) { @@ -185,7 +195,7 @@ __kmp_wait_template(kmp_info_t *this_thr, return false; } th_gtid = this_thr->th.th_info.ds.ds_gtid; - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) return true; @@ -375,7 +385,7 @@ final_spin=FALSE) } #endif // Check if the barrier surrounding this wait loop has been cancelled - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) break; @@ -400,23 +410,31 @@ final_spin=FALSE) #endif // Don't suspend if wait loop designated non-sleepable // in template parameters - if (!sleepable) + if (!Sleepable) continue; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) continue; - KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); - +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + if (__kmp_mwait_enabled || __kmp_umwait_enabled) { + KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid)); + flag->mwait(th_gtid); + } else { +#endif + KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif - flag->suspend(th_gtid); + flag->suspend(th_gtid); #if KMP_OS_UNIX - if (final_spin) - KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); +#endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + } #endif if (TCR_4(__kmp_global.g.g_done)) { @@ -458,7 +476,7 @@ final_spin=FALSE) KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); - if (cancellable) { + if (Cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) { if (tasks_completed) { @@ -475,6 +493,83 @@ final_spin=FALSE) return false; } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// Set up a monitor on the flag variable causing the calling thread to wait in +// a less active state until the flag variable is modified. 
+template +static inline void __kmp_mwait_template(int th_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait); + kmp_info_t *th = __kmp_threads[th_gtid]; + + KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid, + flag->get())); + + // User-level mwait is available + KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled); + + __kmp_suspend_initialize_thread(th); + __kmp_lock_suspend_mx(th); + + volatile void *spin = flag->get(); + void *cacheline = (void *)(kmp_uintptr_t(spin) & ~(CACHE_LINE - 1)); + + if (!flag->done_check()) { + // Mark thread as no longer active + th->th.th_active = FALSE; + if (th->th.th_active_in_pool) { + th->th.th_active_in_pool = FALSE; + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + } + flag->set_sleeping(); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umonitor(cacheline); + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_monitor(cacheline, 0, 0); + } +#endif + // To avoid a race, check flag between 'monitor' and 'mwait'. A write to + // the address could happen after the last time we checked and before + // monitoring started, in which case monitor can't detect the change. + if (flag->done_check()) + flag->unset_sleeping(); + else { + // if flag changes here, wake-up happens immediately + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + __kmp_unlock_suspend_mx(th); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_mwait(0, __kmp_mwait_hints); + } +#endif + KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid)); + __kmp_lock_suspend_mx(th); + // Clean up sleep info; doesn't matter how/why this thread stopped waiting + if (flag->is_sleeping()) + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + } + // Mark thread as active again + th->th.th_active = TRUE; + if (TCR_4(th->th.th_in_pool)) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + th->th.th_active_in_pool = TRUE; + } + } // Drop out to main wait loop to check flag, handle tasks, etc. + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid)); +} +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + /* Release any threads specified as waiting on the flag by releasing the flag and resume the waiting thread if indicated by the sleep bit(s). A thread that calls __kmp_wait_template must call this function to wake up the potentially @@ -545,7 +640,7 @@ template <> struct flag_traits { }; // Basic flag that does not use C11 Atomics -template +template class kmp_basic_flag_native : public kmp_flag_native { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been @@ -588,7 +683,13 @@ class kmp_basic_flag_native : public kmp_flag_native { /*! * @result true if the flag object has been released. */ - bool done_check() { return traits_type::tcr(*(this->get())) == checker; } + bool done_check() { + if (Sleepable) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. 
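The Sleepable variant of done_check() above masks off the sleep bit before comparing against the checker value, because a waiter that intends to block first advertises itself by setting that bit in the flag word. A minimal sketch of the idea (the bit position is an illustrative assumption; the runtime uses its own KMP_BARRIER_SLEEP_STATE layout):

    #include <atomic>
    #include <cstdint>

    // Illustrative constant; the real flag word packs its own sleep bit.
    constexpr std::uint64_t SLEEP_BIT = 1ull;

    // A waiter about to block advertises itself in the flag itself...
    inline std::uint64_t set_sleeping(std::atomic<std::uint64_t> &flag) {
      return flag.fetch_or(SLEEP_BIT, std::memory_order_acq_rel);
    }

    // ...so a "sleepable" readiness check masks that bit off before
    // comparing against the expected value, as done_check() does above.
    inline bool done_check_sleepable(const std::atomic<std::uint64_t> &flag,
                                     std::uint64_t checker) {
      return (flag.load(std::memory_order_acquire) & ~SLEEP_BIT) == checker;
    }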
@@ -643,7 +744,8 @@ class kmp_basic_flag_native : public kmp_flag_native { enum barrier_type get_bt() { return bs_last_barrier; } }; -template class kmp_basic_flag : public kmp_flag { +template +class kmp_basic_flag : public kmp_flag { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been released. */ @@ -685,7 +787,12 @@ template class kmp_basic_flag : public kmp_flag { /*! * @result true if the flag object has been released. */ - bool done_check() { return this->load() == checker; } + bool done_check() { + if (Sleepable) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. @@ -736,14 +843,19 @@ template class kmp_basic_flag : public kmp_flag { enum barrier_type get_bt() { return bs_last_barrier; } }; -class kmp_flag_32 : public kmp_basic_flag { +template +class kmp_flag_32 : public kmp_basic_flag { public: - kmp_flag_32(std::atomic *p) : kmp_basic_flag(p) {} + kmp_flag_32(std::atomic *p) + : kmp_basic_flag(p) {} kmp_flag_32(std::atomic *p, kmp_info_t *thr) - : kmp_basic_flag(p, thr) {} + : kmp_basic_flag(p, thr) {} kmp_flag_32(std::atomic *p, kmp_uint32 c) - : kmp_basic_flag(p, c) {} + : kmp_basic_flag(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -752,27 +864,32 @@ class kmp_flag_32 : public kmp_basic_flag { this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag32; } }; -class kmp_flag_64 : public kmp_basic_flag_native { +template +class kmp_flag_64 : public kmp_basic_flag_native { public: - kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native(p) {} + kmp_flag_64(volatile kmp_uint64 *p) + : kmp_basic_flag_native(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) - : kmp_basic_flag_native(p, thr) {} + : kmp_basic_flag_native(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) - : kmp_basic_flag_native(p, c) {} + : kmp_basic_flag_native(p, c) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -781,26 +898,14 @@ class kmp_flag_64 : public kmp_basic_flag_native { this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } - void wait(kmp_info_t *this_thr, + bool wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) - __kmp_wait_template( - this_thr, this 
USE_ITT_BUILD_ARG(itt_sync_obj)); - else - __kmp_wait_template( - this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - } - bool wait_cancellable_nosleep(kmp_info_t *this_thr, - int final_spin - USE_ITT_BUILD_ARG(void *itt_sync_obj)) { - bool retval = false; - if (final_spin) - retval = __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else - retval = __kmp_wait_template( + return __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); - return retval; } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag64; } @@ -859,7 +964,7 @@ class kmp_flag_oncore : public kmp_flag_native { return true; else if (flag_switch) { this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; - kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, + kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go, (kmp_uint64)KMP_BARRIER_STATE_BUMP); __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } @@ -896,6 +1001,9 @@ class kmp_flag_oncore : public kmp_flag_native { } void release() { __kmp_release_template(this); } void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); } +#endif void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), @@ -915,15 +1023,15 @@ static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { if (!flag) return; - switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) { + switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) { case flag32: - __kmp_resume_32(gtid, NULL); + __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL); break; case flag64: - __kmp_resume_64(gtid, NULL); + __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL); break; case flag_oncore: - __kmp_resume_oncore(gtid, NULL); + __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL); break; } } @@ -932,4 +1040,6 @@ static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { @} */ +#endif // !KMP_USE_ABT + #endif // KMP_WAIT_RELEASE_H diff --git a/runtime/src/kmp_wrapper_getpid.h b/runtime/src/kmp_wrapper_getpid.h index 70db857bc..32ede3ed7 100644 --- a/runtime/src/kmp_wrapper_getpid.h +++ b/runtime/src/kmp_wrapper_getpid.h @@ -22,13 +22,15 @@ #include #if KMP_OS_DARWIN // OS X -#define __kmp_gettid() syscall(SYS_thread_selfid) +#define __kmp_gettid() pthread_mach_thread_np(pthread_self()) #elif KMP_OS_FREEBSD #include #define __kmp_gettid() pthread_getthreadid_np() #elif KMP_OS_NETBSD #include #define __kmp_gettid() _lwp_self() +#elif KMP_OS_OPENBSD +#define __kmp_gettid() syscall(SYS_getthrid) #elif defined(SYS_gettid) // Hopefully other Unix systems define SYS_gettid syscall for getting os thread // id diff --git a/runtime/src/kmp_wrapper_malloc.h b/runtime/src/kmp_wrapper_malloc.h index a50387c7f..c027e0b29 100644 --- a/runtime/src/kmp_wrapper_malloc.h +++ b/runtime/src/kmp_wrapper_malloc.h @@ -15,16 +15,16 @@ #define KMP_WRAPPER_MALLOC_H /* This header serves for 3 purposes: - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. Declaring standard memory allocation routines in OS-independent way. 2. Passing source location info through memory allocation wrappers. 3. Enabling native memory debugging capabilities. - 1. Declaring standard memory allocation rourines in OS-independent way. + 1. 
Declaring standard memory allocation routines in OS-independent way. ----------------------------------------------------------------------- On Linux* OS, alloca() function is declared in header, while on Windows* OS there is no header, function _alloca() (note underscore!) is declared in . This header eliminates these - differences, so client code incluiding "kmp_wrapper_malloc.h" can rely on + differences, so client code including "kmp_wrapper_malloc.h" can rely on following routines: malloc @@ -103,9 +103,9 @@ #error Unknown or unsupported OS. #endif -/* KMP_SRC_LOC_DECL -- Declaring source location paramemters, to be used in +/* KMP_SRC_LOC_DECL -- Declaring source location parameters, to be used in function declaration. - KMP_SRC_LOC_PARM -- Source location paramemters, to be used to pass + KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass parameters to underlying levels. KMP_SRC_LOC_CURR -- Source location arguments describing current location, to be used at top-level. diff --git a/runtime/src/libomp.rc.var b/runtime/src/libomp.rc.var index 958cd045b..6ec57e055 100644 --- a/runtime/src/libomp.rc.var +++ b/runtime/src/libomp.rc.var @@ -19,14 +19,14 @@ LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US // English (U.S.) resources VS_VERSION_INFO VERSIONINFO // Parts of FILEVERSION and PRODUCTVERSION are 16-bit fields, entire build date yyyymmdd // does not fit into one version part, so we need to split it into yyyy and mmdd: - FILEVERSION @LIBOMP_VERSION_MAJOR@,@LIBOMP_VERSION_MINOR@,@LIBOMP_VERSION_BUILD_YEAR@,@LIBOMP_VERSION_BUILD_MONTH_DAY@ - PRODUCTVERSION @LIBOMP_VERSION_MAJOR@,@LIBOMP_VERSION_MINOR@,@LIBOMP_VERSION_BUILD_YEAR@,@LIBOMP_VERSION_BUILD_MONTH_DAY@ + FILEVERSION @LIBBOLT_VERSION_MAJOR@,@LIBBOLT_VERSION_MINOR@,@LIBBOLT_VERSION_BUILD_YEAR@,@LIBBOLT_VERSION_BUILD_MONTH_DAY@ + PRODUCTVERSION @LIBBOLT_VERSION_MAJOR@,@LIBBOLT_VERSION_MINOR@,@LIBBOLT_VERSION_BUILD_YEAR@,@LIBBOLT_VERSION_BUILD_MONTH_DAY@ FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 #if KMP_DEBUG | VS_FF_DEBUG #endif -#if @LIBOMP_VERSION_BUILD@ == 0 +#if @LIBBOLT_VERSION_BUILD@ == 0 | VS_FF_PRIVATEBUILD | VS_FF_PRERELEASE #endif FILEOS VOS_NT_WINDOWS32 // Windows* Server* 2003, XP*, 2000, or NT* @@ -43,15 +43,15 @@ VS_VERSION_INFO VERSIONINFO VALUE "CompanyName", "LLVM\0" // VALUE "LegalTrademarks", "\0" // Not used for now. 
VALUE "ProductName", "LLVM* OpenMP* Runtime Library\0" - VALUE "ProductVersion", "@LIBOMP_VERSION_MAJOR@.@LIBOMP_VERSION_MINOR@\0" - VALUE "FileVersion", "@LIBOMP_VERSION_BUILD@\0" + VALUE "ProductVersion", "@LIBBOLT_VERSION_MAJOR@.@LIBBOLT_VERSION_MINOR@\0" + VALUE "FileVersion", "@LIBBOLT_VERSION_BUILD@\0" VALUE "InternalName", "@LIBOMP_LIB_FILE@\0" VALUE "OriginalFilename", "@LIBOMP_LIB_FILE@\0" VALUE "Comments", - "LLVM* OpenMP* @LIBOMP_LEGAL_TYPE@ Library " - "version @LIBOMP_VERSION_MAJOR@.@LIBOMP_VERSION_MINOR@.@LIBOMP_VERSION_BUILD@ " - "for @LIBOMP_LEGAL_ARCH@ architecture built on @LIBOMP_BUILD_DATE@.\0" -#if @LIBOMP_VERSION_BUILD@ == 0 + "LLVM* OpenMP* @LIBBOLT_LEGAL_TYPE@ Library " + "version @LIBBOLT_VERSION_MAJOR@.@LIBBOLT_VERSION_MINOR@.@LIBBOLT_VERSION_BUILD@ " + "for @LIBBOLT_LEGAL_ARCH@ architecture built on @LIBOMP_BUILD_DATE@.\0" +#if @LIBBOLT_VERSION_BUILD@ == 0 VALUE "PrivateBuild", "This is a development build.\0" #endif diff --git a/runtime/src/ompt-event-specific.h b/runtime/src/ompt-event-specific.h index da6a0e424..9b780f5b2 100644 --- a/runtime/src/ompt-event-specific.h +++ b/runtime/src/ompt-event-specific.h @@ -56,8 +56,11 @@ #define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS #define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_target_emi_implemented ompt_event_UNIMPLEMENTED #define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_target_data_op_emi_implemented ompt_event_UNIMPLEMENTED #define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_target_submit_emi_implemented ompt_event_UNIMPLEMENTED #define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS @@ -82,9 +85,10 @@ #define ompt_callback_work_implemented ompt_event_MAY_ALWAYS_OPTIONAL -#define ompt_callback_master_implemented ompt_event_MAY_ALWAYS_OPTIONAL +#define ompt_callback_masked_implemented ompt_event_MAY_ALWAYS_OPTIONAL #define ompt_callback_target_map_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_target_map_emi_implemented ompt_event_UNIMPLEMENTED #define ompt_callback_sync_region_implemented ompt_event_MAY_ALWAYS_OPTIONAL @@ -99,8 +103,10 @@ #define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL -#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL #define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_error_implemented ompt_event_UNIMPLEMENTED + #endif diff --git a/runtime/src/ompt-general.cpp b/runtime/src/ompt-general.cpp index 41b282700..36bd6b55f 100644 --- a/runtime/src/ompt-general.cpp +++ b/runtime/src/ompt-general.cpp @@ -45,6 +45,20 @@ #define OMPT_STR_MATCH(haystack, needle) (!strcasecmp(haystack, needle)) #endif +// prints for an enabled OMP_TOOL_VERBOSE_INIT. +// In the future a prefix could be added in the first define, the second define +// omits the prefix to allow for continued lines. Example: "PREFIX: Start +// tool... Success." instead of "PREFIX: Start tool... PREFIX: Success." +#define OMPT_VERBOSE_INIT_PRINT(...) \ + if (verbose_init) \ + fprintf(verbose_file, __VA_ARGS__) +#define OMPT_VERBOSE_INIT_CONTINUED_PRINT(...) 
\ + if (verbose_init) \ + fprintf(verbose_file, __VA_ARGS__) + +static FILE *verbose_file; +static int verbose_init; + /***************************************************************************** * types ****************************************************************************/ @@ -230,6 +244,9 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { const char *sep = ":"; #endif + OMPT_VERBOSE_INIT_PRINT("----- START LOGGING OF TOOL REGISTRATION -----\n"); + OMPT_VERBOSE_INIT_PRINT("Search for OMP tool in current address space... "); + #if KMP_OS_DARWIN // Try in the current address space ret = ompt_tool_darwin(omp_version, runtime_version); @@ -240,34 +257,114 @@ ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { #else #error Activation of OMPT is not supported on this platform. #endif - if (ret) + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Sucess.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); return ret; + } // Try tool-libraries-var ICV + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed.\n"); const char *tool_libs = getenv("OMP_TOOL_LIBRARIES"); if (tool_libs) { + OMPT_VERBOSE_INIT_PRINT("Searching tool libraries...\n"); + OMPT_VERBOSE_INIT_PRINT("OMP_TOOL_LIBRARIES = %s\n", tool_libs); char *libs = __kmp_str_format("%s", tool_libs); char *buf; char *fname = __kmp_str_token(libs, sep, &buf); + // Reset dl-error + dlerror(); + while (fname) { #if KMP_OS_UNIX + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); void *h = dlopen(fname, RTLD_LAZY); - if (h) { + if (!h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", + fname); start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); + if (!start_tool) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + } else #elif KMP_OS_WINDOWS + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); HMODULE h = LoadLibrary(fname); - if (h) { + if (!h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", GetLastError()); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", + fname); start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool"); + if (!start_tool) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %s\n", + GetLastError()); + } else #else #error Activation of OMPT is not supported on this platform. 
#endif - if (start_tool && (ret = (*start_tool)(omp_version, runtime_version))) - break; + {// if (start_tool) + ret = (*start_tool)(omp_version, runtime_version); + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + break; + } + OMPT_VERBOSE_INIT_CONTINUED_PRINT( + "Found but not using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT("Continuing search...\n"); + } } fname = __kmp_str_token(NULL, sep, &buf); } __kmp_str_free(&libs); + } else { + OMPT_VERBOSE_INIT_PRINT("No OMP_TOOL_LIBRARIES defined.\n"); + } + + // usable tool found in tool-libraries + if (ret) { + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; } + +#if KMP_OS_UNIX + { // Non-standard: load archer tool if application is built with TSan + const char *fname = "libarcher.so"; + OMPT_VERBOSE_INIT_PRINT( + "...searching tool libraries failed. Using archer tool.\n"); + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); + void *h = dlopen(fname, RTLD_LAZY); + if (h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", fname); + start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); + if (start_tool) { + ret = (*start_tool)(omp_version, runtime_version); + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT( + "----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; + } + OMPT_VERBOSE_INIT_CONTINUED_PRINT( + "Found but not using the OMPT interface.\n"); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + } + } + } +#endif + OMPT_VERBOSE_INIT_PRINT("No OMP tool loaded.\n"); + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); return ret; } @@ -295,11 +392,27 @@ void ompt_pre_init() { else if (OMPT_STR_MATCH(ompt_env_var, "enabled")) tool_setting = omp_tool_enabled; + const char *ompt_env_verbose_init = getenv("OMP_TOOL_VERBOSE_INIT"); + // possible options: disabled | stdout | stderr | + // if set, not empty and not disabled -> prepare for logging + if (ompt_env_verbose_init && strcmp(ompt_env_verbose_init, "") && + !OMPT_STR_MATCH(ompt_env_verbose_init, "disabled")) { + verbose_init = 1; + if (OMPT_STR_MATCH(ompt_env_verbose_init, "STDERR")) + verbose_file = stderr; + else if (OMPT_STR_MATCH(ompt_env_verbose_init, "STDOUT")) + verbose_file = stdout; + else + verbose_file = fopen(ompt_env_verbose_init, "w"); + } else + verbose_init = 0; + #if OMPT_DEBUG printf("ompt_pre_init(): tool_setting = %d\n", tool_setting); #endif switch (tool_setting) { case omp_tool_disabled: + OMPT_VERBOSE_INIT_PRINT("OMP tool disabled. 
\n"); break; case omp_tool_unset: @@ -321,6 +434,8 @@ void ompt_pre_init() { ompt_env_var); break; } + if (verbose_init && verbose_file != stderr && verbose_file != stdout) + fclose(verbose_file); #if OMPT_DEBUG printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled); #endif diff --git a/runtime/src/ompt-internal.h b/runtime/src/ompt-internal.h index 958b5943a..f753ab4eb 100644 --- a/runtime/src/ompt-internal.h +++ b/runtime/src/ompt-internal.h @@ -57,8 +57,6 @@ typedef struct { ompt_data_t task_data; struct kmp_taskdata *scheduling_parent; int thread_num; - int ndeps; - ompt_dependence_t *deps; } ompt_task_info_t; typedef struct { diff --git a/runtime/src/ompt-specific.cpp b/runtime/src/ompt-specific.cpp index 7fb81bb7d..c74426c30 100644 --- a/runtime/src/ompt-specific.cpp +++ b/runtime/src/ompt-specific.cpp @@ -27,7 +27,7 @@ #define THREAD_LOCAL __thread #endif -#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE +#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE_INTERNAL //****************************************************************************** // macros @@ -262,8 +262,6 @@ void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, lwt->ompt_task_info.frame.enter_frame = ompt_data_none; lwt->ompt_task_info.frame.exit_frame = ompt_data_none; lwt->ompt_task_info.scheduling_parent = NULL; - lwt->ompt_task_info.deps = NULL; - lwt->ompt_task_info.ndeps = 0; lwt->heap = 0; lwt->parent = 0; } @@ -457,7 +455,7 @@ int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) { return 0; *addr = ret_addr; - *size = ret_size; + *size = (size_t)ret_size; return 1; } diff --git a/runtime/src/ompt-specific.h b/runtime/src/ompt-specific.h index 47d8a1669..18816e733 100644 --- a/runtime/src/ompt-specific.h +++ b/runtime/src/ompt-specific.h @@ -15,6 +15,7 @@ #include "kmp.h" +#if OMPT_SUPPORT /***************************************************************************** * forward declarations ****************************************************************************/ @@ -74,12 +75,19 @@ inline void *__ompt_load_return_address(int gtid) { return return_address; } -#define OMPT_STORE_RETURN_ADDRESS(gtid) \ +/*#define OMPT_STORE_RETURN_ADDRESS(gtid) \ if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ !__kmp_threads[gtid]->th.ompt_thread_info.return_address) \ __kmp_threads[gtid]->th.ompt_thread_info.return_address = \ - __builtin_return_address(0) + __builtin_return_address(0)*/ +#define OMPT_STORE_RETURN_ADDRESS(gtid) \ + OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address(0)}; #define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid) +#define OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid) \ + ((ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ + __kmp_threads[gtid]->th.ompt_thread_info.return_address)? 
\ + __ompt_load_return_address(gtid): \ + __builtin_return_address(0)) //****************************************************************************** // inline functions @@ -102,4 +110,49 @@ inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } +class OmptReturnAddressGuard { +private: + bool SetAddress{false}; + int Gtid; + +public: + OmptReturnAddressGuard(int Gtid, void *ReturnAddress) : Gtid(Gtid) { + if (ompt_enabled.enabled && Gtid >= 0 && __kmp_threads[Gtid] && + !__kmp_threads[Gtid]->th.ompt_thread_info.return_address) { + SetAddress = true; + __kmp_threads[Gtid]->th.ompt_thread_info.return_address = ReturnAddress; + } + } + ~OmptReturnAddressGuard() { + if (SetAddress) + __kmp_threads[Gtid]->th.ompt_thread_info.return_address = NULL; + } +}; + +#endif // OMPT_SUPPORT + +// macros providing the OMPT callbacks for reduction clause +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) \ + ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \ + ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \ + void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); +#define OMPT_REDUCTION_BEGIN \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \ + my_task_data, return_address); \ + } +#define OMPT_REDUCTION_END \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \ + my_task_data, return_address); \ + } +#else // OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) +#define OMPT_REDUCTION_BEGIN +#define OMPT_REDUCTION_END +#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL + #endif diff --git a/runtime/src/thirdparty/ittnotify/ittnotify.h b/runtime/src/thirdparty/ittnotify/ittnotify.h index ed46cd776..db1c0d0d9 100644 --- a/runtime/src/thirdparty/ittnotify/ittnotify.h +++ b/runtime/src/thirdparty/ittnotify/ittnotify.h @@ -1448,7 +1448,7 @@ ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, /** @endcond */ /** - * @brief Record an free begin occurrence. + * @brief Record a free begin occurrence. */ void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr); @@ -1468,7 +1468,7 @@ ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr)) /** @endcond */ /** - * @brief Record an free end occurrence. + * @brief Record a free end occurrence. */ void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr); @@ -1488,7 +1488,7 @@ ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr)) /** @endcond */ /** - * @brief Record an reallocation begin occurrence. + * @brief Record a reallocation begin occurrence. */ void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized); @@ -1508,7 +1508,7 @@ ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* add /** @endcond */ /** - * @brief Record an reallocation end occurrence. + * @brief Record a reallocation end occurrence. */ void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized); @@ -2303,7 +2303,7 @@ ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_ * name of the metadata), and a value (the actual data). 
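The OmptReturnAddressGuard added in ompt-specific.h above replaces the old macro-based bookkeeping with an RAII object. A minimal standalone sketch of the same pattern, using a plain thread_local slot instead of __kmp_threads[gtid]->th.ompt_thread_info; the names and the simplified slot are illustrative, not the runtime's actual state:

#include <cstdio>

// Simplified stand-in for the per-thread OMPT state; the real runtime keeps
// this in __kmp_threads[gtid]->th.ompt_thread_info.return_address.
static thread_local void *tls_return_address = nullptr;

class ReturnAddressGuard {
  bool SetAddress{false};   // only the outermost guard owns the slot
public:
  explicit ReturnAddressGuard(void *ReturnAddress) {
    if (tls_return_address == nullptr) {
      SetAddress = true;
      tls_return_address = ReturnAddress;
    }
  }
  ~ReturnAddressGuard() {
    if (SetAddress)
      tls_return_address = nullptr;  // cleared automatically on scope exit
  }
};

void runtime_entry_point() {
  // Same spirit as OMPT_STORE_RETURN_ADDRESS(gtid): the guard lives until the
  // enclosing runtime entry point returns.
  ReturnAddressGuard guard{__builtin_return_address(0)};
  std::printf("codeptr_ra = %p\n", tls_return_address);
} // guard destructor resets the slot here

int main() { runtime_entry_point(); }

Compared with the commented-out macro, the guard resets the slot on every exit path from the entry point, including early returns, which appears to be the motivation for the change.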
The encoding of * the value depends on the type of the metadata. * - * The type of metadata is specified by an enumerated type __itt_metdata_type. + * The type of metadata is specified by an enumerated type __itt_metadata_type. * @{ */ @@ -2644,7 +2644,7 @@ ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info /** * @ingroup clockdomains - * @brief Recalculate clock domains frequences and clock base timestamps. + * @brief Recalculate clock domains frequencies and clock base timestamps. */ void ITTAPI __itt_clock_domain_reset(void); @@ -3196,7 +3196,7 @@ ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, #define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) #define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) #else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) +#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail) #define __itt_relation_add_to_current_ex_ptr 0 #define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) #define __itt_relation_add_ex_ptr 0 @@ -3957,7 +3957,7 @@ ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void)) /** @endcond */ /** - * @brief Destroy the inforamtion about stitch point identified by the pointer previously returned by __itt_stack_caller_create() + * @brief Destroy the information about stitch point identified by the pointer previously returned by __itt_stack_caller_create() */ void ITTAPI __itt_stack_caller_destroy(__itt_caller id); diff --git a/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp b/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp index c48b3f420..4936b9baa 100644 --- a/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp +++ b/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp @@ -762,7 +762,7 @@ static const char* __itt_fsplit(const char* s, const char* sep, const char** out /* This function return value of env variable that placed into static buffer. * !!! The same static buffer is used for subsequent calls. !!! - * This was done to aviod dynamic allocation for few calls. + * This was done to avoid dynamic allocation for few calls. * Actually we need this function only four times. */ static const char* __itt_get_env_var(const char* name) @@ -786,7 +786,7 @@ static const char* __itt_get_env_var(const char* name) } else { - /* If environment variable is empty, GetEnvirornmentVariables() + /* If environment variable is empty, GetEnvironmentVariables() * returns zero (number of characters (not including terminating null), * and GetLastError() returns ERROR_SUCCESS. 
*/ DWORD err = GetLastError(); @@ -1012,7 +1012,7 @@ static void __itt_reinit_all_pointers(void) static void __itt_nullify_all_pointers(void) { int i; - /* Nulify all pointers except domain_create, string_handle_create and counter_create */ + /* Nullify all pointers except domain_create, string_handle_create and counter_create */ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; } diff --git a/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h b/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h index eae33e0b1..a4061e168 100644 --- a/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h +++ b/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h @@ -957,9 +957,9 @@ ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain)) #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ -/** @brief Record an frame begin occurrence. */ +/** @brief Record a frame begin occurrence. */ void ITTAPI __itt_frame_begin(__itt_frame frame); -/** @brief Record an frame end occurrence. */ +/** @brief Record a frame end occurrence. */ void ITTAPI __itt_frame_end (__itt_frame frame); /** @cond exclude_from_documentation */ diff --git a/runtime/src/z_Linux_asm.S b/runtime/src/z_Linux_asm.S index b491fcf18..4117f1ff4 100644 --- a/runtime/src/z_Linux_asm.S +++ b/runtime/src/z_Linux_asm.S @@ -36,8 +36,8 @@ // that the caller determines based on the total # threads / # cores. // //.macro pause_op -// mov $100, %rax -// delay %rax +// mov $100, %rax +// delay %rax //.endm # else # define pause_op .byte 0xf3,0x90 @@ -55,7 +55,7 @@ .macro KMP_CFI_DEF .endmacro .macro ALIGN - .align $0 + .align $0 .endmacro .macro DEBUG_INFO /* Not sure what .size does in icc, not sure if we need to do something @@ -63,8 +63,8 @@ */ .endmacro .macro PROC - ALIGN 4 - .globl KMP_PREFIX_UNDERSCORE($0) + ALIGN 4 + .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): .endmacro # else // KMP_OS_DARWIN @@ -78,32 +78,32 @@ KMP_PREFIX_UNDERSCORE($0): # define KMP_LABEL(x) .L_##x // local label hidden from backtraces # endif // KMP_MIC .macro ALIGN size - .align 1<<(\size) + .align 1<<(\size) .endm .macro DEBUG_INFO proc - .cfi_endproc + .cfi_endproc // Not sure why we need .type and .size for the functions - .align 16 - .type \proc,@function + .align 16 + .type \proc,@function .size \proc,.-\proc .endm .macro PROC proc - ALIGN 4 + ALIGN 4 .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): - .cfi_startproc + .cfi_startproc .endm .macro KMP_CFI_DEF_OFFSET sz - .cfi_def_cfa_offset \sz + .cfi_def_cfa_offset \sz .endm .macro KMP_CFI_OFFSET reg, sz - .cfi_offset \reg,\sz + .cfi_offset \reg,\sz .endm .macro KMP_CFI_REGISTER reg - .cfi_def_cfa_register \reg + .cfi_def_cfa_register \reg .endm .macro KMP_CFI_DEF reg, sz - .cfi_def_cfa \reg,\sz + .cfi_def_cfa \reg,\sz .endm # endif // KMP_OS_DARWIN #endif // KMP_ARCH_X86 || KMP_ARCH_x86_64 @@ -115,7 +115,7 @@ KMP_PREFIX_UNDERSCORE(\proc): # define KMP_LABEL(x) L_##x // form the name of label .macro ALIGN - .align $0 + .align $0 .endmacro .macro DEBUG_INFO @@ -125,8 +125,8 @@ KMP_PREFIX_UNDERSCORE(\proc): .endmacro .macro PROC - ALIGN 4 - .globl KMP_PREFIX_UNDERSCORE($0) + ALIGN 4 + .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): .endmacro # else // KMP_OS_DARWIN @@ -135,22 +135,22 @@ KMP_PREFIX_UNDERSCORE($0): # define KMP_LABEL(x) .L_##x // local label hidden from backtraces .macro ALIGN size - .align 1<<(\size) + .align 1<<(\size) .endm .macro DEBUG_INFO proc - 
.cfi_endproc + .cfi_endproc // Not sure why we need .type and .size for the functions - ALIGN 2 - .type \proc,@function - .size \proc,.-\proc + ALIGN 2 + .type \proc,@function + .size \proc,.-\proc .endm .macro PROC proc - ALIGN 2 - .globl KMP_PREFIX_UNDERSCORE(\proc) + ALIGN 2 + .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): - .cfi_startproc + .cfi_startproc .endm # endif // KMP_OS_DARWIN @@ -179,7 +179,7 @@ ___kmp_unnamed_critical_addr: .data .comm .gomp_critical_user_,32,8 .data - ALIGN 4 + ALIGN 4 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .4byte .gomp_critical_user_ @@ -200,7 +200,7 @@ ___kmp_unnamed_critical_addr: .data .comm .gomp_critical_user_,32,8 .data - ALIGN 8 + ALIGN 8 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .8byte .gomp_critical_user_ @@ -219,19 +219,19 @@ __kmp_unnamed_critical_addr: // running Linux* OS // ----------------------------------------------------------------------- - .ident "Intel Corporation" - .data - ALIGN 4 + .ident "Intel Corporation" + .data + ALIGN 4 // void // __kmp_x86_pause( void ); .text - PROC __kmp_x86_pause + PROC __kmp_x86_pause pause_op ret - DEBUG_INFO __kmp_x86_pause + DEBUG_INFO __kmp_x86_pause # if !KMP_ASM_INTRINS @@ -247,7 +247,7 @@ __kmp_unnamed_critical_addr: xaddl %eax,(%ecx) ret - DEBUG_INFO __kmp_test_then_add32 + DEBUG_INFO __kmp_test_then_add32 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed8 @@ -256,14 +256,14 @@ __kmp_unnamed_critical_addr: // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); // // parameters: -// p: 4(%esp) -// d: 8(%esp) +// p: 4(%esp) +// d: 8(%esp) // -// return: %al +// return: %al PROC __kmp_xchg_fixed8 movl 4(%esp), %ecx // "p" - movb 8(%esp), %al // "d" + movb 8(%esp), %al // "d" lock xchgb %al,(%ecx) @@ -279,13 +279,13 @@ __kmp_unnamed_critical_addr: // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); // // parameters: -// p: 4(%esp) -// d: 8(%esp) +// p: 4(%esp) +// d: 8(%esp) // return: %ax PROC __kmp_xchg_fixed16 movl 4(%esp), %ecx // "p" - movw 8(%esp), %ax // "d" + movw 8(%esp), %ax // "d" lock xchgw %ax,(%ecx) @@ -301,14 +301,14 @@ __kmp_unnamed_critical_addr: // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: -// p: 4(%esp) -// d: 8(%esp) +// p: 4(%esp) +// d: 8(%esp) // -// return: %eax +// return: %eax PROC __kmp_xchg_fixed32 movl 4(%esp), %ecx // "p" - movl 8(%esp), %eax // "d" + movl 8(%esp), %eax // "d" lock xchgl %eax,(%ecx) @@ -460,10 +460,10 @@ __kmp_unnamed_critical_addr: // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); // // parameters: -// addr: 4(%esp) -// data: 8(%esp) +// addr: 4(%esp) +// data: 8(%esp) // -// return: %eax +// return: %eax PROC __kmp_xchg_real32 pushl %ebp @@ -494,6 +494,7 @@ __kmp_unnamed_critical_addr: # endif /* !KMP_ASM_INTRINS */ +#if !KMP_USE_ABT //------------------------------------------------------------------------ // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), @@ -514,84 +515,85 @@ __kmp_unnamed_critical_addr: // -- Begin __kmp_invoke_microtask // mark_begin; - PROC __kmp_invoke_microtask - - pushl %ebp - KMP_CFI_DEF_OFFSET 8 - KMP_CFI_OFFSET ebp,-8 - movl %esp,%ebp // establish the base pointer for this routine. - KMP_CFI_REGISTER ebp - subl $8,%esp // allocate space for two local variables. 
- // These varibales are: - // argv: -4(%ebp) - // temp: -8(%ebp) - // - pushl %ebx // save %ebx to use during this routine - // + PROC __kmp_invoke_microtask + + pushl %ebp + KMP_CFI_DEF_OFFSET 8 + KMP_CFI_OFFSET ebp,-8 + movl %esp,%ebp // establish the base pointer for this routine. + KMP_CFI_REGISTER ebp + subl $8,%esp // allocate space for two local variables. + // These varibales are: + // argv: -4(%ebp) + // temp: -8(%ebp) + // + pushl %ebx // save %ebx to use during this routine + // #if OMPT_SUPPORT - movl 28(%ebp),%ebx // get exit_frame address - movl %ebp,(%ebx) // save exit_frame + movl 28(%ebp),%ebx // get exit_frame address + movl %ebp,(%ebx) // save exit_frame #endif - movl 20(%ebp),%ebx // Stack alignment - # args - addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid) - shll $2,%ebx // Number of bytes used on stack: (#args+2)*4 - movl %esp,%eax // - subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this - movl %eax,%ebx // Save to %ebx - andl $0xFFFFFF80,%eax // mask off 7 bits - subl %eax,%ebx // Amount to subtract from %esp - subl %ebx,%esp // Prepare the stack ptr -- - // now it will be aligned on 128-byte boundary at the call + movl 20(%ebp),%ebx // Stack alignment - # args + addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid) + shll $2,%ebx // Number of bytes used on stack: (#args+2)*4 + movl %esp,%eax // + subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this + movl %eax,%ebx // Save to %ebx + andl $0xFFFFFF80,%eax // mask off 7 bits + subl %eax,%ebx // Amount to subtract from %esp + subl %ebx,%esp // Prepare the stack ptr -- + // now it will be aligned on 128-byte boundary at the call - movl 24(%ebp),%eax // copy from p_argv[] - movl %eax,-4(%ebp) // into the local variable *argv. + movl 24(%ebp),%eax // copy from p_argv[] + movl %eax,-4(%ebp) // into the local variable *argv. - movl 20(%ebp),%ebx // argc is 20(%ebp) - shll $2,%ebx + movl 20(%ebp),%ebx // argc is 20(%ebp) + shll $2,%ebx KMP_LABEL(invoke_2): - cmpl $0,%ebx - jg KMP_LABEL(invoke_4) - jmp KMP_LABEL(invoke_3) - ALIGN 2 + cmpl $0,%ebx + jg KMP_LABEL(invoke_4) + jmp KMP_LABEL(invoke_3) + ALIGN 2 KMP_LABEL(invoke_4): - movl -4(%ebp),%eax - subl $4,%ebx // decrement argc. - addl %ebx,%eax // index into argv. - movl (%eax),%edx - pushl %edx - - jmp KMP_LABEL(invoke_2) - ALIGN 2 + movl -4(%ebp),%eax + subl $4,%ebx // decrement argc. + addl %ebx,%eax // index into argv. 
+ movl (%eax),%edx + pushl %edx + + jmp KMP_LABEL(invoke_2) + ALIGN 2 KMP_LABEL(invoke_3): - leal 16(%ebp),%eax // push & tid - pushl %eax + leal 16(%ebp),%eax // push & tid + pushl %eax - leal 12(%ebp),%eax // push & gtid - pushl %eax + leal 12(%ebp),%eax // push & gtid + pushl %eax - movl 8(%ebp),%ebx - call *%ebx // call (*pkfn)(); + movl 8(%ebp),%ebx + call *%ebx // call (*pkfn)(); - movl $1,%eax // return 1; + movl $1,%eax // return 1; - movl -12(%ebp),%ebx // restore %ebx - leave - KMP_CFI_DEF esp,4 - ret + movl -12(%ebp),%ebx // restore %ebx + leave + KMP_CFI_DEF esp,4 + ret - DEBUG_INFO __kmp_invoke_microtask + DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask +#endif /* !KMP_USE_ABT */ // kmp_uint64 // __kmp_hardware_timestamp(void) - PROC __kmp_hardware_timestamp - rdtsc - ret + PROC __kmp_hardware_timestamp + rdtsc + ret - DEBUG_INFO __kmp_hardware_timestamp + DEBUG_INFO __kmp_hardware_timestamp // -- End __kmp_hardware_timestamp #endif /* KMP_ARCH_X86 */ @@ -606,10 +608,10 @@ KMP_LABEL(invoke_3): // -- Machine type P // mark_description "Intel Corporation"; - .ident "Intel Corporation" -// -- .file "z_Linux_asm.S" - .data - ALIGN 4 + .ident "Intel Corporation" +// -- .file "z_Linux_asm.S" + .data + ALIGN 4 // To prevent getting our code into .data section .text added to every routine // definition for x86_64. @@ -623,14 +625,14 @@ KMP_LABEL(invoke_3): // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: -// p: %rdi -// d: %esi +// p: %rdi +// d: %esi // -// return: %eax +// return: %eax .text PROC __kmp_test_then_add32 - movl %esi, %eax // "d" + movl %esi, %eax // "d" lock xaddl %eax,(%rdi) ret @@ -645,13 +647,13 @@ KMP_LABEL(invoke_3): // __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d ); // // parameters: -// p: %rdi -// d: %rsi -// return: %rax +// p: %rdi +// d: %rsi +// return: %rax .text PROC __kmp_test_then_add64 - movq %rsi, %rax // "d" + movq %rsi, %rax // "d" lock xaddq %rax,(%rdi) ret @@ -666,14 +668,14 @@ KMP_LABEL(invoke_3): // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); // // parameters: -// p: %rdi -// d: %sil +// p: %rdi +// d: %sil // -// return: %al +// return: %al .text PROC __kmp_xchg_fixed8 - movb %sil, %al // "d" + movb %sil, %al // "d" lock xchgb %al,(%rdi) @@ -689,13 +691,13 @@ KMP_LABEL(invoke_3): // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); // // parameters: -// p: %rdi -// d: %si +// p: %rdi +// d: %si // return: %ax .text PROC __kmp_xchg_fixed16 - movw %si, %ax // "d" + movw %si, %ax // "d" lock xchgw %ax,(%rdi) @@ -711,14 +713,14 @@ KMP_LABEL(invoke_3): // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: -// p: %rdi -// d: %esi +// p: %rdi +// d: %esi // -// return: %eax +// return: %eax .text PROC __kmp_xchg_fixed32 - movl %esi, %eax // "d" + movl %esi, %eax // "d" lock xchgl %eax,(%rdi) @@ -734,13 +736,13 @@ KMP_LABEL(invoke_3): // __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d ); // // parameters: -// p: %rdi -// d: %rsi -// return: %rax +// p: %rdi +// d: %rsi +// return: %rax .text PROC __kmp_xchg_fixed64 - movq %rsi, %rax // "d" + movq %rsi, %rax // "d" lock xchgq %rax,(%rdi) @@ -756,15 +758,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); // // parameters: -// p: %rdi -// cv: %esi -// sv: %edx +// p: %rdi +// cv: %esi +// sv: %edx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store8 - movb %sil, %al // "cv" + movb %sil, %al // "cv" lock cmpxchgb 
%dl,(%rdi) sete %al // if %al == (%rdi) set %al = 1 else set %al = 0 @@ -781,15 +783,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); // // parameters: -// p: %rdi -// cv: %si -// sv: %dx +// p: %rdi +// cv: %si +// sv: %dx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store16 - movw %si, %ax // "cv" + movw %si, %ax // "cv" lock cmpxchgw %dx,(%rdi) sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0 @@ -806,15 +808,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); // // parameters: -// p: %rdi -// cv: %esi -// sv: %edx +// p: %rdi +// cv: %esi +// sv: %edx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store32 - movl %esi, %eax // "cv" + movl %esi, %eax // "cv" lock cmpxchgl %edx,(%rdi) sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0 @@ -831,10 +833,10 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv ); // // parameters: -// p: %rdi -// cv: %rsi -// sv: %rdx -// return: %eax +// p: %rdi +// cv: %rsi +// sv: %rdx +// return: %eax .text PROC __kmp_compare_and_store64 @@ -854,15 +856,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); // // parameters: -// p: %rdi -// cv: %esi -// sv: %edx +// p: %rdi +// cv: %esi +// sv: %edx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store_ret8 - movb %sil, %al // "cv" + movb %sil, %al // "cv" lock cmpxchgb %dl,(%rdi) ret @@ -877,15 +879,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store16_ret( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); // // parameters: -// p: %rdi -// cv: %si -// sv: %dx +// p: %rdi +// cv: %si +// sv: %dx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store_ret16 - movw %si, %ax // "cv" + movw %si, %ax // "cv" lock cmpxchgw %dx,(%rdi) ret @@ -900,15 +902,15 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); // // parameters: -// p: %rdi -// cv: %esi -// sv: %edx +// p: %rdi +// cv: %esi +// sv: %edx // -// return: %eax +// return: %eax .text PROC __kmp_compare_and_store_ret32 - movl %esi, %eax // "cv" + movl %esi, %eax // "cv" lock cmpxchgl %edx,(%rdi) ret @@ -923,10 +925,10 @@ KMP_LABEL(invoke_3): // __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv ); // // parameters: -// p: %rdi -// cv: %rsi -// sv: %rdx -// return: %eax +// p: %rdi +// cv: %rsi +// sv: %rdx +// return: %eax .text PROC __kmp_compare_and_store_ret64 @@ -951,19 +953,19 @@ KMP_LABEL(invoke_3): // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); // // parameters: -// addr: %rdi -// data: %xmm0 (lower 4 bytes) +// addr: %rdi +// data: %xmm0 (lower 4 bytes) // -// return: %xmm0 (lower 4 bytes) +// return: %xmm0 (lower 4 bytes) .text PROC __kmp_xchg_real32 - movd %xmm0, %eax // load "data" to eax + movd %xmm0, %eax // load "data" to eax lock xchgl %eax, (%rdi) - movd %eax, %xmm0 // load old value into return register + movd %eax, %xmm0 // load old value into return register ret @@ -983,12 +985,12 @@ KMP_LABEL(invoke_3): .text PROC __kmp_xchg_real64 - movd %xmm0, %rax // load "data" to rax + movd %xmm0, %rax // load "data" to rax lock - xchgq %rax, (%rdi) + xchgq %rax, (%rdi) - movd %rax, %xmm0 // load old value into return register + movd %rax, %xmm0 // load old value into return register ret DEBUG_INFO __kmp_xchg_real64 @@ -998,6 
+1000,7 @@ KMP_LABEL(invoke_3): # endif /* !KMP_ASM_INTRINS */ +#if !KMP_USE_ABT //------------------------------------------------------------------------ // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), @@ -1019,172 +1022,173 @@ KMP_LABEL(invoke_3): // note: at call to pkfn must have %rsp 128-byte aligned for compiler // // parameters: -// %rdi: pkfn -// %esi: gtid -// %edx: tid -// %ecx: argc -// %r8: p_argv -// %r9: &exit_frame +// %rdi: pkfn +// %esi: gtid +// %edx: tid +// %ecx: argc +// %r8: p_argv +// %r9: &exit_frame // // locals: -// __gtid: gtid parm pushed on stack so can pass >id to pkfn -// __tid: tid parm pushed on stack so can pass &tid to pkfn +// __gtid: gtid parm pushed on stack so can pass >id to pkfn +// __tid: tid parm pushed on stack so can pass &tid to pkfn // // reg temps: -// %rax: used all over the place -// %rdx: used in stack pointer alignment calculation -// %r11: used to traverse p_argv array -// %rsi: used as temporary for stack parameters -// used as temporary for number of pkfn parms to push -// %rbx: used to hold pkfn address, and zero constant, callee-save -// -// return: %eax (always 1/TRUE) +// %rax: used all over the place +// %rdx: used in stack pointer alignment calculation +// %r11: used to traverse p_argv array +// %rsi: used as temporary for stack parameters +// used as temporary for number of pkfn parms to push +// %rbx: used to hold pkfn address, and zero constant, callee-save +// +// return: %eax (always 1/TRUE) __gtid = -16 __tid = -24 // -- Begin __kmp_invoke_microtask // mark_begin; .text - PROC __kmp_invoke_microtask + PROC __kmp_invoke_microtask - pushq %rbp // save base pointer - KMP_CFI_DEF_OFFSET 16 - KMP_CFI_OFFSET rbp,-16 - movq %rsp,%rbp // establish the base pointer for this routine. - KMP_CFI_REGISTER rbp + pushq %rbp // save base pointer + KMP_CFI_DEF_OFFSET 16 + KMP_CFI_OFFSET rbp,-16 + movq %rsp,%rbp // establish the base pointer for this routine. 
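The 128-byte stack-alignment arithmetic that follows in the x86_64 __kmp_invoke_microtask can be mirrored in C for readability. A sketch under the assumption that only max(0, argc-4) microtask arguments spill to the stack, as the register-assignment comments indicate; the function name and sample values are illustrative only:

#include <cstdint>
#include <cstdio>

// Mirrors the alignment math: 'extra' is the number of bytes of microtask
// arguments that do not fit in registers (everything past p_argv[3]).
uintptr_t sp_at_call(uintptr_t rsp, int argc) {
  uintptr_t extra = (argc > 4) ? (uintptr_t)(argc - 4) * 8 : 0; // cmovsq path
  uintptr_t unaligned = rsp - extra;                // %rsp if the args were pushed now
  uintptr_t aligned = unaligned & ~(uintptr_t)0x7F; // andq $0xFFFFFFFFFFFFFF80,%rax
  // The asm first sets %rsp = aligned + extra; pushing 'extra' bytes of
  // arguments then leaves %rsp == aligned, i.e. 128-byte aligned at the call.
  return aligned;
}

int main() {
  // Hypothetical incoming %rsp and argument count, for illustration only.
  std::printf("%#zx\n", (size_t)sp_at_call(0x7ffcd3a09e58, 9));
}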
+ KMP_CFI_REGISTER rbp #if OMPT_SUPPORT - movq %rbp, (%r9) // save exit_frame + movq %rbp, (%r9) // save exit_frame #endif - pushq %rbx // %rbx is callee-saved register - pushq %rsi // Put gtid on stack so can pass &tgid to pkfn - pushq %rdx // Put tid on stack so can pass &tid to pkfn + pushq %rbx // %rbx is callee-saved register + pushq %rsi // Put gtid on stack so can pass &tgid to pkfn + pushq %rdx // Put tid on stack so can pass &tid to pkfn - movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax - movq $0, %rbx // constant for cmovs later - subq $4, %rax // subtract four args passed in registers to pkfn + movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax + movq $0, %rbx // constant for cmovs later + subq $4, %rax // subtract four args passed in registers to pkfn #if KMP_MIC - js KMP_LABEL(kmp_0) // jump to movq - jmp KMP_LABEL(kmp_0_exit) // jump ahead + js KMP_LABEL(kmp_0) // jump to movq + jmp KMP_LABEL(kmp_0_exit) // jump ahead KMP_LABEL(kmp_0): - movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) + movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) KMP_LABEL(kmp_0_exit): #else - cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) + cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) #endif // KMP_MIC - movq %rax, %rsi // save max(0, argc-4) -> %rsi for later - shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8 + movq %rax, %rsi // save max(0, argc-4) -> %rsi for later + shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8 - movq %rsp, %rdx // - subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx -- - // without align, stack ptr would be this - movq %rdx, %rax // Save to %rax + movq %rsp, %rdx // + subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx -- + // without align, stack ptr would be this + movq %rdx, %rax // Save to %rax - andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align) - subq %rax, %rdx // Amount to subtract from %rsp - subq %rdx, %rsp // Prepare the stack ptr -- - // now %rsp will align to 128-byte boundary at call site + andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align) + subq %rax, %rdx // Amount to subtract from %rsp + subq %rdx, %rsp // Prepare the stack ptr -- + // now %rsp will align to 128-byte boundary at call site - // setup pkfn parameter reg and stack - movq %rcx, %rax // argc -> %rax - cmpq $0, %rsi - je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push - shlq $3, %rcx // argc*8 -> %rcx - movq %r8, %rdx // p_argv -> %rdx - addq %rcx, %rdx // &p_argv[argc] -> %rdx + // setup pkfn parameter reg and stack + movq %rcx, %rax // argc -> %rax + cmpq $0, %rsi + je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push + shlq $3, %rcx // argc*8 -> %rcx + movq %r8, %rdx // p_argv -> %rdx + addq %rcx, %rdx // &p_argv[argc] -> %rdx - movq %rsi, %rcx // max (0, argc-4) -> %rcx + movq %rsi, %rcx // max (0, argc-4) -> %rcx KMP_LABEL(kmp_invoke_push_parms): - // push nth - 7th parms to pkfn on stack - subq $8, %rdx // decrement p_argv pointer to previous parm - movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi - pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order) - subl $1, %ecx + // push nth - 7th parms to pkfn on stack + subq $8, %rdx // decrement p_argv pointer to previous parm + movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi + pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order) + subl $1, %ecx // C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e -// if 
the name of the label that is an operand of this jecxz starts with a dot ("."); -// Apple's linker does not support 1-byte length relocation; +// if the name of the label that is an operand of this jecxz starts with a dot ("."); +// Apple's linker does not support 1-byte length relocation; // Resolution: replace all .labelX entries with L_labelX. - jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left - jmp KMP_LABEL(kmp_invoke_push_parms) - ALIGN 3 -KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers. - // order here is important to avoid trashing - // registers used for both input and output parms! - movq %rdi, %rbx // pkfn -> %rbx - leaq __gtid(%rbp), %rdi // >id -> %rdi (store 1st parm to pkfn) - leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn) + jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left + jmp KMP_LABEL(kmp_invoke_push_parms) + ALIGN 3 +KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers. + // order here is important to avoid trashing + // registers used for both input and output parms! + movq %rdi, %rbx // pkfn -> %rbx + leaq __gtid(%rbp), %rdi // >id -> %rdi (store 1st parm to pkfn) + leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn) - movq %r8, %r11 // p_argv -> %r11 + movq %r8, %r11 // p_argv -> %r11 #if KMP_MIC - cmpq $4, %rax // argc >= 4? - jns KMP_LABEL(kmp_4) // jump to movq - jmp KMP_LABEL(kmp_4_exit) // jump ahead + cmpq $4, %rax // argc >= 4? + jns KMP_LABEL(kmp_4) // jump to movq + jmp KMP_LABEL(kmp_4_exit) // jump ahead KMP_LABEL(kmp_4): - movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) + movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) KMP_LABEL(kmp_4_exit): - cmpq $3, %rax // argc >= 3? - jns KMP_LABEL(kmp_3) // jump to movq - jmp KMP_LABEL(kmp_3_exit) // jump ahead + cmpq $3, %rax // argc >= 3? + jns KMP_LABEL(kmp_3) // jump to movq + jmp KMP_LABEL(kmp_3_exit) // jump ahead KMP_LABEL(kmp_3): - movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) + movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) KMP_LABEL(kmp_3_exit): - cmpq $2, %rax // argc >= 2? - jns KMP_LABEL(kmp_2) // jump to movq - jmp KMP_LABEL(kmp_2_exit) // jump ahead + cmpq $2, %rax // argc >= 2? + jns KMP_LABEL(kmp_2) // jump to movq + jmp KMP_LABEL(kmp_2_exit) // jump ahead KMP_LABEL(kmp_2): - movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) + movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) KMP_LABEL(kmp_2_exit): - cmpq $1, %rax // argc >= 1? - jns KMP_LABEL(kmp_1) // jump to movq - jmp KMP_LABEL(kmp_1_exit) // jump ahead + cmpq $1, %rax // argc >= 1? + jns KMP_LABEL(kmp_1) // jump to movq + jmp KMP_LABEL(kmp_1_exit) // jump ahead KMP_LABEL(kmp_1): - movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) + movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) KMP_LABEL(kmp_1_exit): #else - cmpq $4, %rax // argc >= 4? - cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) + cmpq $4, %rax // argc >= 4? + cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) - cmpq $3, %rax // argc >= 3? - cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) + cmpq $3, %rax // argc >= 3? + cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) - cmpq $2, %rax // argc >= 2? - cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) + cmpq $2, %rax // argc >= 2? 
+ cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) - cmpq $1, %rax // argc >= 1? - cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) + cmpq $1, %rax // argc >= 1? + cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) #endif // KMP_MIC - call *%rbx // call (*pkfn)(); - movq $1, %rax // move 1 into return register; + call *%rbx // call (*pkfn)(); + movq $1, %rax // move 1 into return register; - movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified - movq %rbp, %rsp // restore stack pointer - popq %rbp // restore frame pointer - KMP_CFI_DEF rsp,8 - ret + movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified + movq %rbp, %rsp // restore stack pointer + popq %rbp // restore frame pointer + KMP_CFI_DEF rsp,8 + ret - DEBUG_INFO __kmp_invoke_microtask + DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask +#endif /* !KMP_USE_ABT */ // kmp_uint64 // __kmp_hardware_timestamp(void) .text - PROC __kmp_hardware_timestamp - rdtsc - shlq $32, %rdx - orq %rdx, %rax - ret + PROC __kmp_hardware_timestamp + rdtsc + shlq $32, %rdx + orq %rdx, %rax + ret - DEBUG_INFO __kmp_hardware_timestamp + DEBUG_INFO __kmp_hardware_timestamp // -- End __kmp_hardware_timestamp //------------------------------------------------------------------------ @@ -1206,6 +1210,7 @@ KMP_LABEL(kmp_1_exit): // ' #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 +#if !KMP_USE_ABT //------------------------------------------------------------------------ // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), @@ -1231,26 +1236,26 @@ KMP_LABEL(kmp_1_exit): // } // // parameters: -// x0: pkfn -// w1: gtid -// w2: tid -// w3: argc -// x4: p_argv -// x5: &exit_frame +// x0: pkfn +// w1: gtid +// w2: tid +// w3: argc +// x4: p_argv +// x5: &exit_frame // // locals: -// __gtid: gtid parm pushed on stack so can pass >id to pkfn -// __tid: tid parm pushed on stack so can pass &tid to pkfn +// __gtid: gtid parm pushed on stack so can pass >id to pkfn +// __tid: tid parm pushed on stack so can pass &tid to pkfn // // reg temps: -// x8: used to hold pkfn address -// w9: used as temporary for number of pkfn parms -// x10: used to traverse p_argv array -// x11: used as temporary for stack placement calculation -// x12: used as temporary for stack parameters -// x19: used to preserve exit_frame_ptr, callee-save +// x8: used to hold pkfn address +// w9: used as temporary for number of pkfn parms +// x10: used to traverse p_argv array +// x11: used as temporary for stack placement calculation +// x12: used as temporary for stack parameters +// x19: used to preserve exit_frame_ptr, callee-save // -// return: w0 (always 1/TRUE) +// return: w0 (always 1/TRUE) // __gtid = 4 @@ -1258,80 +1263,82 @@ __tid = 8 // -- Begin __kmp_invoke_microtask // mark_begin; - .text - PROC __kmp_invoke_microtask + .text + PROC __kmp_invoke_microtask - stp x29, x30, [sp, #-16]! + stp x29, x30, [sp, #-16]! # if OMPT_SUPPORT - stp x19, x20, [sp, #-16]! + stp x19, x20, [sp, #-16]! 
# endif - mov x29, sp - - orr w9, wzr, #1 - add w9, w9, w3, lsr #1 - sub sp, sp, w9, uxtw #4 - mov x11, sp - - mov x8, x0 - str w1, [x29, #-__gtid] - str w2, [x29, #-__tid] - mov w9, w3 - mov x10, x4 + mov x29, sp + + orr w9, wzr, #1 + add w9, w9, w3, lsr #1 + sub sp, sp, w9, uxtw #4 + mov x11, sp + + mov x8, x0 + str w1, [x29, #-__gtid] + str w2, [x29, #-__tid] + mov w9, w3 + mov x10, x4 # if OMPT_SUPPORT - mov x19, x5 - str x29, [x19] + mov x19, x5 + str x29, [x19] # endif - sub x0, x29, #__gtid - sub x1, x29, #__tid + sub x0, x29, #__gtid + sub x1, x29, #__tid - cbz w9, KMP_LABEL(kmp_1) - ldr x2, [x10] + cbz w9, KMP_LABEL(kmp_1) + ldr x2, [x10] - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x3, [x10, #8]! + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x3, [x10, #8]! - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x4, [x10, #8]! + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x4, [x10, #8]! - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x5, [x10, #8]! + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x5, [x10, #8]! - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x6, [x10, #8]! + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x6, [x10, #8]! - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x7, [x10, #8]! + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x7, [x10, #8]! KMP_LABEL(kmp_0): - sub w9, w9, #1 - cbz w9, KMP_LABEL(kmp_1) - ldr x12, [x10, #8]! - str x12, [x11], #8 - b KMP_LABEL(kmp_0) + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x12, [x10, #8]! + str x12, [x11], #8 + b KMP_LABEL(kmp_0) KMP_LABEL(kmp_1): - blr x8 - orr w0, wzr, #1 - mov sp, x29 + blr x8 + orr w0, wzr, #1 + mov sp, x29 # if OMPT_SUPPORT - str xzr, [x19] - ldp x19, x20, [sp], #16 + str xzr, [x19] + ldp x19, x20, [sp], #16 # endif - ldp x29, x30, [sp], #16 - ret + ldp x29, x30, [sp], #16 + ret - DEBUG_INFO __kmp_invoke_microtask + DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask +#endif /* !KMP_USE_ABT */ #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */ #if KMP_ARCH_PPC64 +#if !KMP_USE_ABT //------------------------------------------------------------------------ // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), @@ -1357,45 +1364,45 @@ KMP_LABEL(kmp_1): // } // // parameters: -// r3: pkfn -// r4: gtid -// r5: tid -// r6: argc -// r7: p_argv -// r8: &exit_frame +// r3: pkfn +// r4: gtid +// r5: tid +// r6: argc +// r7: p_argv +// r8: &exit_frame // -// return: r3 (always 1/TRUE) +// return: r3 (always 1/TRUE) // .text -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 .abiversion 2 # endif - .globl __kmp_invoke_microtask + .globl __kmp_invoke_microtask -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 .p2align 4 # else - .p2align 2 + .p2align 2 # endif - .type __kmp_invoke_microtask,@function + .type __kmp_invoke_microtask,@function -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 __kmp_invoke_microtask: .Lfunc_begin0: .Lfunc_gep0: - addis 2, 12, .TOC.-.Lfunc_gep0@ha - addi 2, 2, .TOC.-.Lfunc_gep0@l + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l .Lfunc_lep0: - .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0 + .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0 # else - .section .opd,"aw",@progbits + .section .opd,"aw",@progbits __kmp_invoke_microtask: - .p2align 3 - .quad .Lfunc_begin0 - .quad .TOC.@tocbase - .quad 0 - .text + .p2align 3 + .quad .Lfunc_begin0 + .quad .TOC.@tocbase + .quad 0 + .text .Lfunc_begin0: # endif @@ -1409,157 +1416,153 @@ __kmp_invoke_microtask: // and 8 bytes to store r31. 
With OMP-T support, we need an additional 8 bytes // to save r30 to hold a copy of r8. - .cfi_startproc - mflr 0 - std 31, -8(1) - std 0, 16(1) + .cfi_startproc + mflr 0 + std 31, -8(1) + std 0, 16(1) // This is unusual because normally we'd set r31 equal to r1 after the stack // frame is established. In this case, however, we need to dynamically compute // the stack frame size, and so we keep a direct copy of r1 to access our // register save areas and restore the r1 value before returning. - mr 31, 1 - .cfi_def_cfa_register r31 - .cfi_offset r31, -8 - .cfi_offset lr, 16 + mr 31, 1 + .cfi_def_cfa_register r31 + .cfi_offset r31, -8 + .cfi_offset lr, 16 // Compute the size necessary for the local stack frame. -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 li 12, 72 # else - li 12, 88 + li 12, 88 # endif - sldi 0, 6, 3 - add 12, 0, 12 - neg 12, 12 - -// We need to make sure that the stack frame stays aligned (to 16 bytes, except -// under the BG/Q CNK, where it must be to 32 bytes). -# if KMP_OS_CNK - li 0, -32 -# else + sldi 0, 6, 3 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). li 0, -16 -# endif and 12, 0, 12 // Establish the local stack frame. - stdux 1, 1, 12 + stdux 1, 1, 12 # if OMPT_SUPPORT - .cfi_offset r30, -16 - std 30, -16(31) - std 1, 0(8) - mr 30, 8 + .cfi_offset r30, -16 + std 30, -16(31) + std 1, 0(8) + mr 30, 8 # endif // Store gtid and tid to the stack because they're passed by reference to the microtask. - stw 4, -20(31) - stw 5, -24(31) + stw 4, -20(31) + stw 5, -24(31) - mr 12, 6 - mr 4, 7 + mr 12, 6 + mr 4, 7 - cmpwi 0, 12, 1 - blt 0, .Lcall + cmpwi 0, 12, 1 + blt 0, .Lcall - ld 5, 0(4) + ld 5, 0(4) - cmpwi 0, 12, 2 - blt 0, .Lcall + cmpwi 0, 12, 2 + blt 0, .Lcall - ld 6, 8(4) + ld 6, 8(4) - cmpwi 0, 12, 3 - blt 0, .Lcall + cmpwi 0, 12, 3 + blt 0, .Lcall - ld 7, 16(4) + ld 7, 16(4) - cmpwi 0, 12, 4 - blt 0, .Lcall + cmpwi 0, 12, 4 + blt 0, .Lcall - ld 8, 24(4) + ld 8, 24(4) - cmpwi 0, 12, 5 - blt 0, .Lcall + cmpwi 0, 12, 5 + blt 0, .Lcall - ld 9, 32(4) + ld 9, 32(4) - cmpwi 0, 12, 6 - blt 0, .Lcall + cmpwi 0, 12, 6 + blt 0, .Lcall - ld 10, 40(4) + ld 10, 40(4) - cmpwi 0, 12, 7 - blt 0, .Lcall + cmpwi 0, 12, 7 + blt 0, .Lcall // There are more than 6 microtask parameters, so we need to store the // remainder to the stack. - addi 12, 12, -6 - mtctr 12 + addi 12, 12, -6 + mtctr 12 // These are set to 8 bytes before the first desired store address (we're using // pre-increment loads and stores in the loop below). The parameter save area // for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and // 32 + 8*8 == 96 bytes above r1 for ELFv2. addi 4, 4, 40 -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 addi 12, 1, 88 # else - addi 12, 1, 104 + addi 12, 1, 104 # endif .Lnext: - ldu 0, 8(4) - stdu 0, 8(12) - bdnz .Lnext + ldu 0, 8(4) + stdu 0, 8(12) + bdnz .Lnext .Lcall: -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 std 2, 24(1) mr 12, 3 #else - std 2, 40(1) + std 2, 40(1) // For ELFv1, we need to load the actual function address from the function descriptor. 
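For reference, the ELFv1 descriptor mentioned in the comment above normally has the following shape; the struct name is invented for illustration, and the field comments map to the ld instructions that follow:

#include <cstdint>

// Usual 64-bit ELFv1 layout: a function pointer refers to a descriptor,
// not to the first instruction. The loads below fill r12 (entry), r2 (TOC
// base) and r11 (environment) from these fields before the bctrl.
struct ppc64_elfv1_func_desc {
  uint64_t entry;   // ld 12, 0(3)  -- actual code address
  uint64_t toc;     // ld 2,  8(3)  -- TOC base for the callee
  uint64_t env;     // ld 11, 16(3) -- environment pointer, normally unused in C
};

// Under ELFv2 there is no descriptor: the pointer is the entry address and
// the callee derives its own TOC from r12, which is why that branch simply
// does "mr 12, 3".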
- ld 12, 0(3) - ld 2, 8(3) - ld 11, 16(3) + ld 12, 0(3) + ld 2, 8(3) + ld 11, 16(3) #endif - addi 3, 31, -20 - addi 4, 31, -24 + addi 3, 31, -20 + addi 4, 31, -24 mtctr 12 bctrl -# if KMP_ARCH_PPC64_LE +# if KMP_ARCH_PPC64_ELFv2 ld 2, 24(1) # else - ld 2, 40(1) + ld 2, 40(1) # endif # if OMPT_SUPPORT - li 3, 0 - std 3, 0(30) + li 3, 0 + std 3, 0(30) # endif - li 3, 1 + li 3, 1 # if OMPT_SUPPORT - ld 30, -16(31) + ld 30, -16(31) # endif - mr 1, 31 - ld 0, 16(1) - ld 31, -8(1) - mtlr 0 - blr + mr 1, 31 + ld 0, 16(1) + ld 31, -8(1) + mtlr 0 + blr - .long 0 - .quad 0 + .long 0 + .quad 0 .Lfunc_end0: - .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0 - .cfi_endproc + .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc // -- End __kmp_invoke_microtask +#endif /* !KMP_USE_ABT */ #endif /* KMP_ARCH_PPC64 */ @@ -1742,14 +1745,19 @@ __kmp_unnamed_critical_addr: #endif /* KMP_ARCH_ARM */ #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 +#ifndef KMP_PREFIX_UNDERSCORE +# define KMP_PREFIX_UNDERSCORE(x) x +#endif .data .comm .gomp_critical_user_,32,8 .data .align 8 - .global __kmp_unnamed_critical_addr -__kmp_unnamed_critical_addr: + .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr) +KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr): .8byte .gomp_critical_user_ - .size __kmp_unnamed_critical_addr,8 +#ifdef __ELF__ + .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8 +#endif #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 */ diff --git a/runtime/src/z_Linux_util.cpp b/runtime/src/z_Linux_util.cpp index 0ee12927e..6261f615a 100644 --- a/runtime/src/z_Linux_util.cpp +++ b/runtime/src/z_Linux_util.cpp @@ -21,6 +21,10 @@ #include "kmp_wait_release.h" #include "kmp_wrapper_getpid.h" +#if KMP_USE_ABT +#include "kmp_taskdeps.h" +#endif + #if !KMP_OS_DRAGONFLY && !KMP_OS_FREEBSD && !KMP_OS_NETBSD && !KMP_OS_OPENBSD #include #endif @@ -31,7 +35,7 @@ #include #include -#if KMP_OS_LINUX && !KMP_OS_CNK +#if KMP_OS_LINUX #include #if KMP_USE_FUTEX // We should really include , but that causes compatibility problems on @@ -54,7 +58,7 @@ #include #include #include -#elif KMP_OS_NETBSD +#elif KMP_OS_NETBSD || KMP_OS_OPENBSD #include #include #endif @@ -70,7 +74,8 @@ struct kmp_sys_timer { }; // Convert timespec to nanoseconds. 
-#define TS2NS(timespec) (((timespec).tv_sec * 1e9) + (timespec).tv_nsec) +#define TS2NS(timespec) \ + (((timespec).tv_sec * (long int)1e9) + (timespec).tv_nsec) static struct kmp_sys_timer __kmp_sys_timer_data; @@ -84,11 +89,13 @@ static int __kmp_init_runtime = FALSE; static int __kmp_fork_count = 0; +#if !KMP_USE_ABT static pthread_condattr_t __kmp_suspend_cond_attr; static pthread_mutexattr_t __kmp_suspend_mutex_attr; static kmp_cond_align_t __kmp_wait_cv; static kmp_mutex_align_t __kmp_wait_mx; +#endif kmp_uint64 __kmp_ticks_per_msec = 1000000; @@ -100,6 +107,21 @@ static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { } #endif +#if KMP_USE_ABT +static inline ABT_pool __kmp_abt_get_pool_thread(int self_rank, + int master_place_id, int tid, + int num_threads, int level, + kmp_proc_bind_t proc_bind, + int *p_place_id); +static inline ABT_pool __kmp_abt_get_pool_task(); +static int __kmp_abt_sched_init(ABT_sched sched, ABT_sched_config config); +static void __kmp_abt_sched_run(ABT_sched sched); +static int __kmp_abt_sched_free(ABT_sched sched); +static void __kmp_abt_initialize(void); +static void __kmp_abt_finalize(void); + +#endif + #if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED) /* Affinity support */ @@ -133,13 +155,13 @@ void __kmp_affinity_determine_capable(const char *env_var) { // If Linux* OS: // If the syscall fails or returns a suggestion for the size, // then we don't have to search for an appropriate size. - int gCode; - int sCode; + long gCode; + long sCode; unsigned char *buf; buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_SIZE_LIMIT, buf); KA_TRACE(30, ("__kmp_affinity_determine_capable: " - "initial getaffinity call returned %d errno = %d\n", + "initial getaffinity call returned %ld errno = %d\n", gCode, errno)); // if ((gCode < 0) && (errno == ENOSYS)) @@ -164,11 +186,11 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode > 0) { // Linux* OS only // The optimal situation: the OS returns the size of the buffer it expects. // - // A verification of correct behavior is that Isetaffinity on a NULL + // A verification of correct behavior is that setaffinity on a NULL // buffer with the same size fails with errno set to EFAULT. 
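The reworked TS2NS macro above keeps the conversion in integer arithmetic instead of promoting it to double. A small sketch of the difference, assuming a typical LP64 Linux target and a hypothetical timestamp:

#include <cstdio>
#include <ctime>

int main() {
  struct timespec ts = {1700000000, 123456789};  // hypothetical timestamp

  // Old form: tv_sec * 1e9 promotes the whole expression to double, whose
  // 53-bit mantissa cannot hold ~1.7e18 exactly, so low nanosecond digits
  // are rounded away.
  double ns_fp  = ts.tv_sec * 1e9 + ts.tv_nsec;
  // New form: multiply by an integer constant and stay in 64-bit integers.
  long   ns_int = ts.tv_sec * (long)1e9 + ts.tv_nsec;

  std::printf("double : %.0f\n", ns_fp);   // 1700000000123456768 (rounded)
  std::printf("integer: %ld\n", ns_int);   // 1700000000123456789
}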
sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " - "setaffinity for mask size %d returned %d errno = %d\n", + "setaffinity for mask size %ld returned %ld errno = %d\n", gCode, sCode, errno)); if (sCode < 0) { if (errno == ENOSYS) { @@ -207,7 +229,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { for (size = 1; size <= KMP_CPU_SET_SIZE_LIMIT; size *= 2) { gCode = syscall(__NR_sched_getaffinity, 0, size, buf); KA_TRACE(30, ("__kmp_affinity_determine_capable: " - "getaffinity for mask size %d returned %d errno = %d\n", + "getaffinity for mask size %ld returned %ld errno = %d\n", size, gCode, errno)); if (gCode < 0) { @@ -239,7 +261,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " - "setaffinity for mask size %d returned %d errno = %d\n", + "setaffinity for mask size %ld returned %ld errno = %d\n", gCode, sCode, errno)); if (sCode < 0) { if (errno == ENOSYS) { // Linux* OS only @@ -276,7 +298,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { } } #elif KMP_OS_FREEBSD - int gCode; + long gCode; unsigned char *buf; buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); gCode = pthread_getaffinity_np(pthread_self(), KMP_CPU_SET_SIZE_LIMIT, reinterpret_cast(buf)); @@ -286,7 +308,7 @@ void __kmp_affinity_determine_capable(const char *env_var) { if (gCode == 0) { KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT); KA_TRACE(10, ("__kmp_affinity_determine_capable: " - "affinity supported (mask size %d)\n"< + "affinity supported (mask size %d)\n", (int)__kmp_affin_mask_size)); KMP_INTERNAL_FREE(buf); return; @@ -315,8 +337,11 @@ void __kmp_affinity_determine_capable(const char *env_var) { #if KMP_USE_FUTEX int __kmp_futex_determine_capable() { +#if KMP_USE_ABT + return 0; // Not supported. +#else int loc = 0; - int rc = syscall(__NR_futex, &loc, FUTEX_WAKE, 1, NULL, NULL, 0); + long rc = syscall(__NR_futex, &loc, FUTEX_WAKE, 1, NULL, NULL, 0); int retval = (rc == 0) || (errno != ENOSYS); KA_TRACE(10, @@ -325,6 +350,7 @@ int __kmp_futex_determine_capable() { retval ? "" : " not")); return retval; +#endif } #endif // KMP_USE_FUTEX @@ -456,15 +482,25 @@ void __kmp_terminate_thread(int gtid) { #ifdef KMP_CANCEL_THREADS KA_TRACE(10, ("__kmp_terminate_thread: kill (%d)\n", gtid)); +#if KMP_USE_ABT + status = ABT_thread_cancel(th->th.th_info.ds.ds_thread); + if (status != ABT_SUCCESS) { + __kmp_fatal(KMP_MSG(CantTerminateWorkerThread), KMP_ERR(status), + __kmp_msg_null); + } +#else // KMP_USE_ABT status = pthread_cancel(th->th.th_info.ds.ds_thread); if (status != 0 && status != ESRCH) { __kmp_fatal(KMP_MSG(CantTerminateWorkerThread), KMP_ERR(status), __kmp_msg_null); } +#endif // !KMP_USE_ABT #endif KMP_YIELD(TRUE); } // +#if !KMP_USE_ABT + /* Set thread stack info according to values returned by pthread_getattr_np(). If values are unreasonable, assume call failed and use incremental stack refinement method instead. 
Returns TRUE if the stack parameters could be @@ -596,6 +632,80 @@ static void *__kmp_launch_worker(void *thr) { return exit_val; } +#else // !KMP_USE_ABT + +static void __kmp_abt_create_workers_recursive(kmp_team_t *team, int start_tid, + int end_tid); +static void __kmp_abt_join_workers_recursive(kmp_team_t *team, int start_tid, + int end_tid); + +static void __kmp_abt_launch_worker(void *thr) { + int gtid; + kmp_info_t *this_thr = (kmp_info_t *)thr; + kmp_team_t *team = this_thr->th.th_team; + + gtid = this_thr->th.th_info.ds.ds_gtid; + KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); + +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity_set_init_mask(gtid, FALSE); +#endif + + KMP_MB(); + + const int start_tid = __kmp_tid_from_gtid(gtid); + const int end_tid = this_thr->th.th_creation_group_end_tid; + + if (end_tid - start_tid > 1) + __kmp_abt_create_workers_recursive(team, start_tid, end_tid); + + if (__kmp_tasking_mode != tskm_immediate_exec) { + /* It is originally set up in task_team_sync() */ + this_thr->th.th_task_team = team->t.t_task_team[this_thr->th.th_task_state]; + } + if (team && !TCR_4(__kmp_global.g.g_done)) { + /* run our new task */ + if ((team->t.t_pkfn) != NULL) { + int rc; + KA_TRACE(20, ("__kmp_abt_launch_worker: T#%d(%d:%d) " + "invoke microtask = %p\n", + gtid, team->t.t_id, __kmp_tid_from_gtid(gtid), + team->t.t_pkfn)); + rc = team->t.t_invoke(gtid); + KMP_ASSERT(rc); + KMP_MB(); + KA_TRACE(20, ("__kmp_abt_launch_worker: T#%d(%d:%d) " + "done microtask = %p\n", + gtid, team->t.t_id, __kmp_tid_from_gtid(gtid), + team->t.t_pkfn)); + } + } + + KA_TRACE(10, ("__kmp_abt_launch_worker: T#%d done\n", gtid)); + + __kmp_abt_wait_child_tasks(this_thr, true, FALSE); + this_thr->th.th_task_team = NULL; + + /* Below is for the implicit task */ + kmp_taskdata_t *td = this_thr->th.th_current_task; + if (td->td_task_queue) { + KMP_DEBUG_ASSERT(td->td_tq_cur_size == 0); + KMP_INTERNAL_FREE(td->td_task_queue); + td->td_task_queue = NULL; + td->td_tq_max_size = 0; + } + + /* This thread has been finished. Any task can use this as a parent. */ + __kmp_abt_release_info(this_thr); + + if (end_tid - start_tid > 1) + __kmp_abt_join_workers_recursive(team, start_tid, end_tid); + + KA_TRACE(10, ("__kmp_abt_launch_worker: T#%d finish\n", gtid)); +} + +#endif // KMP_USE_ABT + #if KMP_USE_MONITOR /* The monitor thread controls all of the threads in the complex */ @@ -770,6 +880,8 @@ static void *__kmp_launch_monitor(void *thr) { } #endif // KMP_USE_MONITOR +#if !KMP_USE_ABT + void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) { pthread_t handle; pthread_attr_t thread_attr; @@ -809,6 +921,7 @@ void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) { KMP_MB(); /* Flush all pending memory write invalidates. */ #ifdef KMP_THREAD_ATTR + status = pthread_attr_init(&thread_attr); if (status != 0) { __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null); @@ -901,8 +1014,237 @@ void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) { } // __kmp_create_worker +#else // KMP_USE_ABT + +static inline void __kmp_abt_create_workers_impl(kmp_team_t *team, + const int self_rank, + int start_tid, int end_tid) { + // tid must be start_tid. 
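The creation loop below divides [start_tid, end_tid) into fork_num_ways chunks once the range reaches fork_cutoff, spawns one worker per chunk, and then recurses into its own first chunk; each spawned worker repeats the division on the range it owns. A stripped-down sketch of that shape, with hypothetical tuning values standing in for __kmp_abt_global.fork_num_ways and fork_cutoff:

#include <algorithm>
#include <cstdio>

static const int num_ways = 4;  // hypothetical stand-ins for the
static const int cutoff   = 8;  // __kmp_abt_global tuning knobs

// Mirrors __kmp_abt_create_workers_impl's recursive division: the creator of
// [start, end) spawns one worker per chunk of size 'inc' and then recurses
// into its own first chunk.
void create_range(int start, int end, int depth) {
  int n = end - start;
  int inc = (n < cutoff) ? 1 : (n + num_ways - 1) / num_ways;
  for (int f = start + inc; f < end; f += inc)
    std::printf("%*sT#%d spawns T#%d, which owns [%d, %d)\n",
                depth * 2, "", start, f, f, std::min(f + inc, end));
  if (inc != 1)
    create_range(start, std::min(start + inc, end), depth + 1);
}

int main() { create_range(0, 32, 0); }  // e.g. a 32-thread team

In the runtime itself each spawned ULT performs the same division on its own sub-range from __kmp_abt_launch_worker, so thread creation fans out as a tree rather than a single serial loop.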
+ +#ifdef KMP_THREAD_ATTR + ABT_thread_attr thread_attr = ABT_THREAD_ATTR_NULL; +#endif + + const kmp_proc_bind_t proc_bind = team->t.t_proc_bind_applied; + const int master_place_id = team->t.t_master_place_id; + const int team_level = team->t.t_level; + const int num_threads = team->t.t_nproc; + + const int num_ways = __kmp_abt_global.fork_num_ways; + const int cutoff = __kmp_abt_global.fork_cutoff; + const int inc = ((end_tid - start_tid) < cutoff) ? 1 + : ((end_tid - start_tid + num_ways - 1) / num_ways); + KMP_DEBUG_ASSERT(self_rank != -1); + KMP_DEBUG_ASSERT(master_place_id != -1); + KMP_DEBUG_ASSERT(inc > 0); + + // create / revive workers. + for (int f = start_tid + inc; f < end_tid; f += inc) { + kmp_info_t *th = team->t.t_threads[f]; + + // set up recursive division policy. + int new_creation_group_end_tid = f + inc; + if (f + inc > end_tid) + new_creation_group_end_tid = end_tid; + +#if KMP_BARRIER_ICV_PUSH + // If we create a thread, the master thread eagerly pushes it. + // If it has been run, the slave thread reads it from its master. + __kmp_init_implicit_task(team->t.t_ident, th, team, f, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, + &team->t.t_master_icvs); +#endif + + // [SM] th->th.th_info.ds.ds_gtid is setup in __kmp_allocate_thread + KMP_DEBUG_ASSERT(th->th.th_info.ds.ds_gtid == __kmp_gtid_from_tid(f, team)); + // uber thread is created in __kmp_abt_create_uber(). + KMP_DEBUG_ASSERT(!KMP_UBER_GTID(__kmp_gtid_from_tid(f, team))); + +#if KMP_STATS_ENABLED + int gtid = __kmp_gtid_from_tid(f, team); + + // sets up worker thread stats + __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid); + + // th->th.th_stats is used to transfer thread-specific stats-pointer to + // __kmp_launch_worker. So when thread is created (goes into + // __kmp_launch_worker) it will set its thread local pointer to + // th->th.th_stats + if (!KMP_UBER_GTID(gtid)) { + th->th.th_stats = __kmp_stats_list->push_back(gtid); + } else { + // For root threads, __kmp_stats_thread_ptr is set in + // __kmp_register_root(), so set the th->th.th_stats field to it. + th->th.th_stats = __kmp_stats_thread_ptr; + } + __kmp_release_tas_lock(&__kmp_stats_lock, gtid); +#endif // KMP_STATS_ENABLED + + ABT_pool target; + int place_id = 0; + target = __kmp_abt_get_pool_thread(self_rank, master_place_id, f, + num_threads, team_level, proc_bind, + &place_id); + th->th.th_current_place_id = place_id; + th->th.th_creation_group_end_tid = new_creation_group_end_tid; + + if (th->th.th_info.ds.ds_thread == ABT_THREAD_NULL) { + int status; + // Create threads. +#ifdef KMP_THREAD_ATTR + if (thread_attr == ABT_THREAD_ATTR_NULL) { + status = ABT_thread_attr_create(&thread_attr); + KMP_ASSERT(status == ABT_SUCCESS); + status = ABT_thread_attr_set_stacksize(thread_attr, __kmp_stksize); + KMP_ASSERT(status == ABT_SUCCESS); + } +#endif + status = ABT_thread_create(target, __kmp_abt_launch_worker, (void *)th, + thread_attr, &th->th.th_info.ds.ds_thread); + KMP_ASSERT(status == ABT_SUCCESS); + } else { + // Revive thread. + int status = ABT_thread_revive(target, __kmp_abt_launch_worker, + (void *)th, &th->th.th_info.ds.ds_thread); + KMP_ASSERT(status == ABT_SUCCESS); + } + } + +#ifdef KMP_THREAD_ATTR + if (thread_attr != ABT_THREAD_ATTR_NULL) { + int status = ABT_thread_attr_free(&thread_attr); + KMP_ASSERT(status == ABT_SUCCESS); + } +#endif /* KMP_THREAD_ATTR */ + + if (inc != 1) { + // Create threads in a sub group. 
+ int rec_start_tid = start_tid; + int rec_end_tid = start_tid + inc; + if (rec_end_tid > end_tid) + rec_end_tid = end_tid; + __kmp_abt_create_workers_impl(team, self_rank, rec_start_tid, rec_end_tid); + } +} + +static void __kmp_abt_create_workers_recursive(kmp_team_t *team, int start_tid, + int end_tid) { + int self_rank; + ABT_xstream_self_rank(&self_rank); + __kmp_abt_create_workers_impl(team, self_rank, start_tid, end_tid); +} + +void __kmp_abt_create_workers(kmp_team_t *team) { + const int team_level = team->t.t_level; + const int num_threads = team->t.t_nproc; +#if KMP_BARRIER_ICV_PUSH + // set up the master icvs. + copy_icvs(&team->t.t_master_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); +#endif + + // Get self_rank + int self_rank; + ABT_xstream_self_rank(&self_rank); + + // Set up proc bind. + kmp_proc_bind_t proc_bind = proc_bind_false; + // Set up the affinity of the master thread. + kmp_proc_bind_t team_proc_bind = team->t.t_proc_bind; + if (team_proc_bind == proc_bind_default) { + // Use global setting. + int size = __kmp_nested_proc_bind.size; + if (size > (team_level - 1)) + proc_bind = __kmp_nested_proc_bind.bind_types[team_level - 1]; + } else if (team_proc_bind != proc_bind_intel) { + proc_bind = team_proc_bind; + } + team->t.t_proc_bind_applied = proc_bind; + + // Obtain master place id. + int master_tid = team->t.t_master_tid; + int master_place_id; + if (team_level <= 1) { + master_place_id = 0; // master place is set to 0. + } else { + kmp_team_t *parent_team = team->t.t_parent; + master_place_id + = parent_team->t.t_threads[master_tid]->th.th_current_place_id; + if (master_place_id == -1) { + // master thread is not bound to any place. + // Use the current place. + master_place_id = __kmp_abt_global.locals[self_rank].place_id; + } + } + team->t.t_master_place_id = master_place_id; + + int place_id; + __kmp_abt_get_pool_thread(self_rank, master_place_id, master_tid, num_threads, + team_level, proc_bind, &place_id); + team->t.t_threads[0]->th.th_current_place_id = place_id; + + // core. + __kmp_abt_create_workers_impl(team, self_rank, 0, num_threads); +} // __kmp_abt_create_workers + +static inline void __kmp_abt_join_workers_impl(kmp_team_t *team, int start_tid, + int end_tid) { + KMP_MB(); /* Flush all pending memory write invalidates. */ + + const int num_ways = __kmp_abt_global.fork_num_ways; + const int cutoff = __kmp_abt_global.fork_cutoff; + const int inc = ((end_tid - start_tid) < cutoff) ? 1 + : ((end_tid - start_tid + num_ways - 1) / num_ways); + + if (inc != 1) { + // Join threads in a sub group first. + int rec_start_tid = start_tid; + int rec_end_tid = start_tid + inc; + if (rec_end_tid > end_tid) + rec_end_tid = end_tid; + __kmp_abt_join_workers_recursive(team, rec_start_tid, rec_end_tid); + } + + kmp_info_t **threads = team->t.t_threads; + + /* Join Argobots ULTs here */ + for (int f = start_tid + inc; f < end_tid; f += inc) { + // t_threads[0] is not joined. + ABT_thread ds_thread = threads[f]->th.th_info.ds.ds_thread; + int status = ABT_thread_join(ds_thread); + KMP_DEBUG_ASSERT(status == ABT_SUCCESS); + (void)status; + } + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ +} // __kmp_abt_join_workers_impl + +static void __kmp_abt_join_workers_recursive(kmp_team_t *team, int start_tid, + int end_tid) { + __kmp_abt_join_workers_impl(team, start_tid, end_tid); +} + +void __kmp_abt_join_workers(kmp_team_t *team) { + const int num_threads = team->t.t_nproc; + __kmp_abt_join_workers_impl(team, 0, num_threads); + for (int tid = 0; tid < num_threads; tid++) { + kmp_info_t *th = team->t.t_threads[tid]; + // Reset th_current_task; th_current_task must be consistent when the team + // is reused in the future. BOLT cannot run tasks on top of implicit tasks, + // so such an inconsistency problem occurs. + th->th.th_current_task = &team->t.t_implicit_task_taskdata[tid]; + // Reset threads so that tasks cannot use these threads. + KMP_DEBUG_ASSERT(th->th.th_active == FALSE); + if (tid != 0) { + th->th.th_active = TRUE; + } + } +} // __kmp_abt_join_workers + +#endif /* KMP_USE_ABT */ + #if KMP_USE_MONITOR void __kmp_create_monitor(kmp_info_t *th) { +#if !KMP_USE_ABT pthread_t handle; pthread_attr_t thread_attr; size_t size; @@ -1039,17 +1381,28 @@ void __kmp_create_monitor(kmp_info_t *th) { KA_TRACE(10, ("__kmp_create_monitor: monitor created %#.8lx\n", th->th.th_info.ds.ds_thread)); +#else // !KMP_USE_ABT + + return; // Nothing to do + +#endif // KMP_USE_ABT } // __kmp_create_monitor #endif // KMP_USE_MONITOR void __kmp_exit_thread(int exit_status) { +#if KMP_USE_ABT + ABT_thread_exit(); +#else pthread_exit((void *)(intptr_t)exit_status); +#endif } // __kmp_exit_thread #if KMP_USE_MONITOR void __kmp_resume_monitor(); void __kmp_reap_monitor(kmp_info_t *th) { +#if !KMP_USE_ABT + int status; void *exit_val; @@ -1090,18 +1443,36 @@ void __kmp_reap_monitor(kmp_info_t *th) { th->th.th_info.ds.ds_thread)); KMP_MB(); /* Flush all pending memory write invalidates. */ + +#else // !KMP_USE_ABT + + return; // Nothing to do. + +#endif // KMP_USE_ABT } #endif // KMP_USE_MONITOR void __kmp_reap_worker(kmp_info_t *th) { int status; +#if !KMP_USE_ABT void *exit_val; +#endif KMP_MB(); /* Flush all pending memory write invalidates. */ KA_TRACE( 10, ("__kmp_reap_worker: try to reap T#%d\n", th->th.th_info.ds.ds_gtid)); +#if KMP_USE_ABT + + ABT_thread ds_thread = th->th.th_info.ds.ds_thread; + if (ds_thread != ABT_THREAD_NULL) { + status = ABT_thread_free(&ds_thread); + KMP_ASSERT(status == ABT_SUCCESS); + } + +#else // KMP_USE_ABT + status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val); #ifdef KMP_DEBUG /* Don't expose these to the user until we understand when they trigger */ @@ -1119,6 +1490,8 @@ void __kmp_reap_worker(kmp_info_t *th) { th->th.th_info.ds.ds_gtid)); KMP_MB(); /* Flush all pending memory write invalidates. */ + +#endif // !KMP_USE_ABT } #if KMP_HANDLE_SIGNALS @@ -1149,6 +1522,7 @@ static void __kmp_team_handler(int signo) { if (__kmp_debug_buf) { __kmp_dump_debug_buffer(); } + __kmp_unregister_library(); // cleanup shared memory KMP_MB(); // Flush all pending memory write invalidates. TCW_4(__kmp_global.g.g_abort, signo); KMP_MB(); // Flush all pending memory write invalidates. @@ -1271,8 +1645,8 @@ static void __kmp_atfork_prepare(void) { } static void __kmp_atfork_parent(void) { - __kmp_release_bootstrap_lock(&__kmp_initz_lock); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); } /* Reset the library so execution in the child starts "all over again" with @@ -1280,6 +1654,7 @@ static void __kmp_atfork_parent(void) { allocated by parent, just abandon it to be safe. 
*/ static void __kmp_atfork_child(void) { __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); /* TODO make sure this is done right for nested/sibling */ // ATT: Memory leaks are here? TODO: Check it and fix. /* KMP_ASSERT( 0 ); */ @@ -1287,7 +1662,7 @@ static void __kmp_atfork_child(void) { ++__kmp_fork_count; #if KMP_AFFINITY_SUPPORTED -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_FREEBSD // reset the affinity in the child to the initial thread // affinity in the parent kmp_set_thread_affinity_mask_initial(); @@ -1301,7 +1676,6 @@ static void __kmp_atfork_child(void) { } #endif // KMP_AFFINITY_SUPPORTED - __kmp_init_runtime = FALSE; #if KMP_USE_MONITOR __kmp_init_monitor = 0; #endif @@ -1354,6 +1728,8 @@ static void __kmp_atfork_child(void) { __kmp_itt_reset(); // reset ITT's global state #endif /* USE_ITT_BUILD */ + __kmp_serial_initialize(); + /* This is necessary to make sure no stale data is left around */ /* AC: customers complain that we use unsafe routines in the atfork handler. Mathworks: dlsym() is unsafe. We call dlsym and dlopen @@ -1375,15 +1751,22 @@ void __kmp_register_atfork(void) { } void __kmp_suspend_initialize(void) { +#if KMP_USE_ABT + /* BOLT does not need to initialize them. */ +#else int status; status = pthread_mutexattr_init(&__kmp_suspend_mutex_attr); KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status); status = pthread_condattr_init(&__kmp_suspend_cond_attr); KMP_CHECK_SYSFAIL("pthread_condattr_init", status); +#endif } void __kmp_suspend_initialize_thread(kmp_info_t *th) { ANNOTATE_HAPPENS_AFTER(&th->th.th_suspend_init_count); +#if KMP_USE_ABT + /* BOLT does not need to initialize them. */ +#else int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count); int new_value = __kmp_fork_count + 1; // Return if already initialized @@ -1408,10 +1791,14 @@ void __kmp_suspend_initialize_thread(kmp_info_t *th) { KMP_ATOMIC_ST_REL(&th->th.th_suspend_init_count, new_value); ANNOTATE_HAPPENS_BEFORE(&th->th.th_suspend_init_count); } +#endif } void __kmp_suspend_uninitialize_thread(kmp_info_t *th) { - if (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) > __kmp_fork_count) { +#if KMP_USE_ABT + /* BOLT does not need to initialize them. */ +#else + if (th->th.th_suspend_init_count > __kmp_fork_count) { /* this means we have initialize the suspension pthread objects for this thread in this instance of the process */ int status; @@ -1428,23 +1815,33 @@ void __kmp_suspend_uninitialize_thread(kmp_info_t *th) { KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count) == __kmp_fork_count); } +#endif } // return true if lock obtained, false otherwise int __kmp_try_suspend_mx(kmp_info_t *th) { +#if KMP_USE_ABT + return 1; +#else return (pthread_mutex_trylock(&th->th.th_suspend_mx.m_mutex) == 0); +#endif } void __kmp_lock_suspend_mx(kmp_info_t *th) { +#if !KMP_USE_ABT int status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); +#endif } void __kmp_unlock_suspend_mx(kmp_info_t *th) { +#if !KMP_USE_ABT int status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +#endif } +#if !KMP_USE_ABT /* This routine puts the calling thread to sleep after setting the sleep bit for the indicated flag variable to true. 
*/ template @@ -1459,8 +1856,7 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { __kmp_suspend_initialize_thread(th); - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", th_gtid, flag->get())); @@ -1471,8 +1867,7 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x," @@ -1535,7 +1930,7 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { th_gtid)); status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex); -#endif +#endif // USE_SUSPEND_TIMEOUT if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); @@ -1575,21 +1970,26 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { } #endif - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. The target thread must already have called __kmp_suspend_template() */ @@ -1609,8 +2009,7 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { __kmp_suspend_initialize_thread(th); - status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)CCAST(void *, th->th.th_sleep_loc); @@ -1619,13 +2018,11 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { // First, check if the flag is null or its type has changed. If so, someone // else woke it up. 
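  /*
   * For reference, the handshake the checks below rely on (this template is
   * only compiled for the pthread path; under KMP_USE_ABT the whole region
   * is guarded out by the #if !KMP_USE_ABT above and the suspend mutex
   * wrappers become no-ops):
   *
   *   suspending thread                    resuming thread
   *   -----------------                    ---------------
   *   __kmp_lock_suspend_mx()              __kmp_lock_suspend_mx()
   *   set sleep bit in the flag            re-check flag under the lock
   *   pthread_cond_wait(cv, mx)            clear the sleep bit
   *   ...wakes with mx held...             pthread_cond_signal(cv)
   *   __kmp_unlock_suspend_mx()            __kmp_unlock_suspend_mx()
   *
   * Re-checking the flag while the mutex is held is what rules out a lost
   * wakeup, which is why the null/type test just below happens only after
   * the lock has been taken.
   */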
if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type - // simply shows what - // flag was cast to + // simply shows what flag was cast to KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag(%p)\n", gtid, target_gtid, NULL)); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } else { // if multiple threads are sleeping, flag should be internally // referring to a specific thread here @@ -1635,8 +2032,7 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { "awake: flag(%p): " "%u => %u\n", gtid, target_gtid, flag->get(), old_spin, flag->load())); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); return; } KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " @@ -1656,23 +2052,27 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { #endif status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond); KMP_CHECK_SYSFAIL("pthread_cond_signal", status); - status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); - KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + #if KMP_USE_MONITOR void __kmp_resume_monitor() { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); @@ -1703,9 +2103,31 @@ void __kmp_resume_monitor() { } #endif // KMP_USE_MONITOR -void __kmp_yield() { sched_yield(); } +#endif // !KMP_USE_ABT + +void __kmp_yield() { +#if KMP_USE_ABT + ABT_thread_yield(); +#else + sched_yield(); +#endif +} void __kmp_gtid_set_specific(int gtid) { +#if KMP_USE_ABT + ABT_thread self; + kmp_info_t *th; + KMP_ASSERT(__kmp_init_runtime); + ABT_thread_self(&self); + + if (self != ABT_THREAD_NULL) { + ABT_thread_get_arg(self, (void **)&th); + KMP_ASSERT(th != NULL); + th->th.th_info.ds.ds_gtid = gtid; + KMP_ASSERT(__kmp_init_gtid); + return; + } +#endif // KMP_USE_ABT if (__kmp_init_gtid) { int status; status = pthread_setspecific(__kmp_gtid_threadprivate_key, @@ -1718,6 +2140,29 @@ void __kmp_gtid_set_specific(int gtid) { int __kmp_gtid_get_specific() { int gtid; +#if KMP_USE_ABT + + ABT_thread self; + ABT_thread_self(&self); + if (self == ABT_THREAD_NULL) { + KMP_ASSERT(__kmp_init_gtid); + /* External threads might call OpenMP functions. 
*/ + gtid = (int)(size_t)pthread_getspecific(__kmp_gtid_threadprivate_key); + KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n", + __kmp_gtid_threadprivate_key, gtid)); + } else { + kmp_info_t *th; + ABT_thread_get_arg(self, (void **)&th); + if (th == NULL) { + gtid = KMP_GTID_DNE; + } else { + gtid = th->th.th_info.ds.ds_gtid; + } + KA_TRACE(50, ("__kmp_gtid_get_specific: ULT:%p gtid:%d\n", self, gtid)); + } + +#else // KMP_USE_ABT + if (!__kmp_init_gtid) { KA_TRACE(50, ("__kmp_gtid_get_specific: runtime shutdown, returning " "KMP_GTID_SHUTDOWN\n")); @@ -1731,6 +2176,8 @@ int __kmp_gtid_get_specific() { } KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n", __kmp_gtid_threadprivate_key, gtid)); + +#endif // !KMP_USE_ABT return gtid; } @@ -1740,7 +2187,8 @@ double __kmp_read_cpu_time(void) { /*t =*/times(&buffer); - return (buffer.tms_utime + buffer.tms_cutime) / (double)CLOCKS_PER_SEC; + return (double)(buffer.tms_utime + buffer.tms_cutime) / + (double)CLOCKS_PER_SEC; } int __kmp_read_system_info(struct kmp_sys_info *info) { @@ -1781,7 +2229,7 @@ void __kmp_read_system_time(double *delta) { status = gettimeofday(&tval, NULL); KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); TIMEVAL_TO_TIMESPEC(&tval, &stop); - t_ns = TS2NS(stop) - TS2NS(__kmp_sys_timer_data.start); + t_ns = (double)(TS2NS(stop) - TS2NS(__kmp_sys_timer_data.start)); *delta = (t_ns * 1e-9); } @@ -1800,7 +2248,7 @@ static int __kmp_get_xproc(void) { #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_HURD - r = sysconf(_SC_NPROCESSORS_ONLN); + __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); #elif KMP_OS_DARWIN @@ -1846,8 +2294,10 @@ int __kmp_read_from_file(char const *path, char const *format, ...) { void __kmp_runtime_initialize(void) { int status; +#if !KMP_USE_ABT pthread_mutexattr_t mutex_attr; pthread_condattr_t cond_attr; +#endif if (__kmp_init_runtime) { return; @@ -1875,7 +2325,7 @@ void __kmp_runtime_initialize(void) { if (sysconf(_SC_THREADS)) { /* Query the maximum number of threads */ - __kmp_sys_max_nth = sysconf(_SC_THREAD_THREADS_MAX); + __kmp_type_convert(sysconf(_SC_THREAD_THREADS_MAX), &(__kmp_sys_max_nth)); if (__kmp_sys_max_nth == -1) { /* Unlimited threads for NPTL */ __kmp_sys_max_nth = INT_MAX; @@ -1897,6 +2347,9 @@ void __kmp_runtime_initialize(void) { status = pthread_key_create(&__kmp_gtid_threadprivate_key, __kmp_internal_end_dest); KMP_CHECK_SYSFAIL("pthread_key_create", status); +#if KMP_USE_ABT + __kmp_abt_initialize(); +#else status = pthread_mutexattr_init(&mutex_attr); KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status); status = pthread_mutex_init(&__kmp_wait_mx.m_mutex, &mutex_attr); @@ -1905,6 +2358,7 @@ void __kmp_runtime_initialize(void) { KMP_CHECK_SYSFAIL("pthread_condattr_init", status); status = pthread_cond_init(&__kmp_wait_cv.c_cond, &cond_attr); KMP_CHECK_SYSFAIL("pthread_cond_init", status); +#endif #if USE_ITT_BUILD __kmp_itt_initialize(); #endif /* USE_ITT_BUILD */ @@ -1926,6 +2380,9 @@ void __kmp_runtime_destroy(void) { status = pthread_key_delete(__kmp_gtid_threadprivate_key); KMP_CHECK_SYSFAIL("pthread_key_delete", status); +#if KMP_USE_ABT + __kmp_abt_finalize(); +#else status = pthread_mutex_destroy(&__kmp_wait_mx.m_mutex); if (status != 0 && status != EBUSY) { KMP_SYSFAIL("pthread_mutex_destroy", status); @@ -1934,6 +2391,7 @@ void __kmp_runtime_destroy(void) { if (status != 0 && status != EBUSY) { KMP_SYSFAIL("pthread_cond_destroy", status); } +#endif #if KMP_AFFINITY_SUPPORTED 
__kmp_affinity_uninitialize(); #endif @@ -1989,7 +2447,7 @@ void __kmp_initialize_system_tick() { nsec2 = __kmp_now_nsec(); diff = nsec2 - nsec; if (diff > 0) { - kmp_uint64 tpms = (kmp_uint64)(1e6 * (delay + (now - goal)) / diff); + kmp_uint64 tpms = ((kmp_uint64)1e6 * (delay + (now - goal)) / diff); if (tpms > 0) __kmp_ticks_per_msec = tpms; } @@ -2130,9 +2588,36 @@ int __kmp_is_address_mapped(void *addr) { } } KMP_INTERNAL_FREE(kiv); -#elif KMP_OS_DRAGONFLY || KMP_OS_OPENBSD +#elif KMP_OS_OPENBSD + + int mib[3]; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC_VMMAP; + mib[2] = getpid(); - // FIXME(DragonFly, OpenBSD): Implement this + size_t size; + uint64_t end; + rc = sysctl(mib, 3, NULL, &size, NULL, 0); + KMP_ASSERT(!rc); + KMP_ASSERT(size); + end = size; + + struct kinfo_vmentry kiv = {.kve_start = 0}; + + while ((rc = sysctl(mib, 3, &kiv, &size, NULL, 0)) == 0) { + KMP_ASSERT(size); + if (kiv.kve_end == end) + break; + + if (kiv.kve_start >= (uint64_t)addr && kiv.kve_end <= (uint64_t)addr) { + found = 1; + break; + } + kiv.kve_start += 1; + } +#elif KMP_OS_DRAGONFLY + + // FIXME(DragonFly): Implement this found = 1; #else @@ -2164,13 +2649,13 @@ int __kmp_get_load_balance(int max) { // getloadavg() may return the number of samples less than requested that is // less than 3. if (__kmp_load_balance_interval < 180 && (res >= 1)) { - ret_avg = averages[0]; // 1 min + ret_avg = (int)averages[0]; // 1 min } else if ((__kmp_load_balance_interval >= 180 && __kmp_load_balance_interval < 600) && (res >= 2)) { - ret_avg = averages[1]; // 5 min + ret_avg = (int)averages[1]; // 5 min } else if ((__kmp_load_balance_interval >= 600) && (res == 3)) { - ret_avg = averages[2]; // 15 min + ret_avg = (int)averages[2]; // 15 min } else { // Error occurred return -1; } @@ -2180,14 +2665,14 @@ int __kmp_get_load_balance(int max) { #else // Linux* OS -// The fuction returns number of running (not sleeping) threads, or -1 in case +// The function returns number of running (not sleeping) threads, or -1 in case // of error. Error could be reported if Linux* OS kernel too old (without // "/proc" support). Counting running threads stops if max running threads // encountered. int __kmp_get_load_balance(int max) { static int permanent_error = 0; static int glb_running_threads = 0; // Saved count of the running threads for - // the thread balance algortihm + // the thread balance algorithm static double glb_call_time = 0; /* Thread balance algorithm call time */ int running_threads = 0; // Number of running threads in the system. @@ -2295,7 +2780,7 @@ int __kmp_get_load_balance(int max) { if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) { ++total_threads; - // Consruct complete stat file path. Easiest way would be: + // Construct complete stat file path. Easiest way would be: // __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str, // task_entry->d_name ); // but seriae of __kmp_str_buf_cat works a bit faster. 
@@ -2337,7 +2822,7 @@ int __kmp_get_load_balance(int max) { -- ln */ char buffer[65]; - int len; + ssize_t len; len = read(stat_file, buffer, sizeof(buffer) - 1); if (len >= 0) { buffer[len] = 0; @@ -2400,7 +2885,7 @@ int __kmp_get_load_balance(int max) { #endif // USE_LOAD_BALANCE -#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ +#if KMP_USE_ABT || !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64) @@ -2484,6 +2969,505 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], p_argv[11], p_argv[12], p_argv[13], p_argv[14]); break; + case 16: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15]); + break; + case 17: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16]); + break; + case 18: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17]); + break; + case 19: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18]); + break; + case 20: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19]); + break; + case 21: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20]); + break; + case 22: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21]); + break; + case 23: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22]); + break; + case 24: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23]); + break; + case 25: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], 
p_argv[24]); + break; + case 26: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25]); + break; + case 27: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26]); + break; + case 28: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27]); + break; + case 29: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28]); + break; + case 30: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29]); + break; + case 31: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30]); + break; + case 32: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31]); + break; + case 33: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32]); + break; + case 34: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], 
p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33]); + break; + case 35: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34]); + break; + case 36: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35]); + break; + case 37: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36]); + break; + case 38: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37]); + break; + case 39: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38]); + break; + case 40: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39]); + break; + case 41: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], 
p_argv[37], p_argv[38], p_argv[39], p_argv[40]); + break; + case 42: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41]); + break; + case 43: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42]); + break; + case 44: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43]); + break; + case 45: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44]); + break; + case 46: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45]); + break; + case 47: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46]); + break; + 
case 48: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47]); + break; + case 49: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48]); + break; + case 50: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49]); + break; + case 51: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50]); + break; + case 52: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51]); + break; + case 53: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], 
p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52]); + break; + case 54: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53]); + break; + case 55: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54]); + break; + case 56: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55]); + break; + case 57: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56]); + break; + case 58: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + 
p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57]); + break; + case 59: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58]); + break; + case 60: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58], p_argv[59]); + break; + case 61: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58], p_argv[59], p_argv[60]); + break; + case 62: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58], p_argv[59], 
p_argv[60], + p_argv[61]); + break; + case 63: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58], p_argv[59], p_argv[60], + p_argv[61], p_argv[62]); + break; + case 64: + (*pkfn)(>id, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], + p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], + p_argv[11], p_argv[12], p_argv[13], p_argv[14], p_argv[15], + p_argv[16], p_argv[17], p_argv[18], p_argv[19], p_argv[20], + p_argv[21], p_argv[22], p_argv[23], p_argv[24], p_argv[25], + p_argv[26], p_argv[27], p_argv[28], p_argv[29], p_argv[30], + p_argv[31], p_argv[32], p_argv[33], p_argv[34], p_argv[35], + p_argv[36], p_argv[37], p_argv[38], p_argv[39], p_argv[40], + p_argv[41], p_argv[42], p_argv[43], p_argv[44], p_argv[45], + p_argv[46], p_argv[47], p_argv[48], p_argv[49], p_argv[50], + p_argv[51], p_argv[52], p_argv[53], p_argv[54], p_argv[55], + p_argv[56], p_argv[57], p_argv[58], p_argv[59], p_argv[60], + p_argv[61], p_argv[62], p_argv[63]); } return 1; @@ -2491,4 +3475,809 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, #endif +#if KMP_USE_ABT + +// self_rank and master_place_id must be specified. +static inline ABT_pool __kmp_abt_get_pool_thread(int self_rank, + int master_place_id, int tid, + int num_threads, int level, + kmp_proc_bind_t proc_bind, + int *p_place_id) { + KMP_DEBUG_ASSERT(self_rank >= 0); + KMP_DEBUG_ASSERT(master_place_id >= 0); + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(tid >= 0); + if (level == 0) { + // The initial thread must be bound to the first place unless proc_bind is + // proc_bind_false + if (proc_bind == proc_bind_false || proc_bind == proc_bind_unset) { + // Push to a shared pool. + *p_place_id = -1; + return __kmp_abt_global.locals[self_rank].shared_pool; + } else { + // Push to the first place pool. + *p_place_id = 0; + return __kmp_abt_global.place_pools[0]; + } + } else { + switch (proc_bind) { + case proc_bind_unset: + // Push to a shared pool. + *p_place_id = -1; + return __kmp_abt_global.locals[self_rank].shared_pool; + case proc_bind_close: { + const int num_places = __kmp_abt_global.num_places; + int push_place_id; + KMP_DEBUG_ASSERT(master_place_id != -1); + if (num_threads <= num_places) { + push_place_id = master_place_id + tid; + } else { + push_place_id = master_place_id + (tid * num_places) / num_threads; + } + push_place_id = (push_place_id >= num_places) + ? (push_place_id - num_places) : push_place_id; + *p_place_id = push_place_id; + return __kmp_abt_global.place_pools[push_place_id]; + } + case proc_bind_master: { + // Use master pool. + int place_id = __kmp_abt_global.locals[self_rank].place_id; + ABT_pool place_pool = __kmp_abt_global.locals[self_rank].place_pool; + *p_place_id = place_id; + return place_pool; + } + case proc_bind_spread: + case proc_bind_true: { + // Push to a place pool. 
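      /*
       * Worked example of the spread mapping below, with illustrative
       * values: num_places = 4, num_threads = 8, master_place_id = 2.
       *
       *   push_place_id = 2 + (tid * 4) / 8, wrapped modulo num_places
       *   tid:    0  1  2  3  4  5  6  7
       *   place:  2  2  3  3  0  0  1  1
       *
       * Consecutive thread ids therefore fan out across all places starting
       * from the master's place; the proc_bind_close case above uses the
       * same wrap but assigns master_place_id + tid when the team fits into
       * the available places, keeping threads adjacent to the master.
       */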
+ const int num_places = __kmp_abt_global.num_places; + int push_place_id = master_place_id + (tid * num_places) / num_threads; + push_place_id = (push_place_id >= num_places) + ? (push_place_id - num_places) : push_place_id; + *p_place_id = push_place_id; + return __kmp_abt_global.place_pools[push_place_id]; + } + case proc_bind_false: + default: { + // Push to a shared pool. + *p_place_id = -1; + return __kmp_abt_global.locals[self_rank].shared_pool; + } + } + } +} + +static inline ABT_pool __kmp_abt_get_pool_task() { + int self_rank; + ABT_xstream_self_rank(&self_rank); + return __kmp_abt_global.locals[self_rank].shared_pool; +} + +void __kmp_abt_release_info(kmp_info_t *th) { + KMP_DEBUG_ASSERT(th->th.th_active == TRUE); + TCW_4(th->th.th_active, FALSE); +} + +void __kmp_abt_acquire_info_for_task(kmp_info_t *th, kmp_taskdata_t *taskdata, + const kmp_team_t *match_team, int atomic) { + if (atomic) { + while (1) { + // task must be executed by an inactive thread belonging to the same team; + // if not, yield to a scheduler. + + // Quick check. + if (th->th.th_team != match_team) + goto END_WHILE; + // Take a lock. + if (KMP_COMPARE_AND_STORE_RET32(&th->th.th_active, FALSE, TRUE) != FALSE) + goto END_WHILE; + // th->th.th_team might have been updated while taking a lock; if th_team + // is not matched, yield to a scheduler. + if (th->th.th_team != match_team) { + __kmp_abt_release_info(th); + goto END_WHILE; + } + break; + END_WHILE: + ABT_thread_yield(); + } + } else { + KMP_DEBUG_ASSERT(th->th.th_active == FALSE); + th->th.th_active = TRUE; + } + th->th.th_current_task = taskdata; +} + +void __kmp_abt_set_self_info(kmp_info_t *th) { + ABT_thread self; + + KMP_ASSERT(__kmp_init_runtime); + ABT_thread_self(&self); + if (self == ABT_THREAD_NULL) { + KMP_ASSERT(__kmp_init_gtid); + /* External threads might call OpenMP functions. */ + int gtid = (size_t)pthread_getspecific(__kmp_gtid_threadprivate_key); + KA_TRACE(50, ("__kmp_abt_set_self_info: key:%d gtid:%d\n", + __kmp_gtid_threadprivate_key, gtid)); + __kmp_threads[gtid] = th; + } else { + int ret = ABT_thread_set_arg(self, (void *)th); + KMP_ASSERT(ret == ABT_SUCCESS); + } +} + +kmp_info_t *__kmp_abt_get_self_info(void) { + ABT_thread self; + + KMP_ASSERT(__kmp_init_runtime); + ABT_thread_self(&self); + if (self == ABT_THREAD_NULL) { + KMP_ASSERT(__kmp_init_gtid); + /* External threads might call OpenMP functions. 
*/ + int gtid = (size_t)pthread_getspecific(__kmp_gtid_threadprivate_key); + KA_TRACE(50, ("__kmp_abt_get_self_info: key:%d gtid:%d\n", + __kmp_gtid_threadprivate_key, gtid)); + return __kmp_threads[gtid]; + } else { + kmp_info_t *th; + int ret = ABT_thread_get_arg(self, (void **)&th); + KMP_ASSERT(th != NULL); + KMP_ASSERT(ret == ABT_SUCCESS); + return th; + } +} + +static void __kmp_abt_initialize(void) { + int status; + int num_xstreams; + int i, k; + kmp_abt_affinity_places_t *p_affinity_places = NULL; + + { + int verbose = 0; + const char *env = getenv("KMP_ABT_VERBOSE"); + if (env && atoi(env) != 0) { + verbose = 1; + printf("=== BOLT info (KMP_ABT_VERBOSE) ===\n"); + } + + env = getenv("KMP_ABT_NUM_ESS"); + if (env) { + num_xstreams = atoi(env); + if (num_xstreams < __kmp_xproc) + __kmp_xproc = num_xstreams; + } else { + num_xstreams = __kmp_xproc; + } + if (verbose) + printf("KMP_ABT_NUM_ESS = %d\n", num_xstreams); + + env = getenv("OMP_PLACES"); + if (!env) { + env = getenv("KMP_AFFINITY"); + if (!env) { + env = "threads"; + } else { + if (verbose) + printf("[warning] BOLT does not support KMP_AFFINITY; " + "parse it as OMP_PLACES.\n"); + } + } + p_affinity_places = __kmp_abt_parse_affinity(num_xstreams, env, strlen(env), + verbose); + { + bool failure = false; + for (int rank = 0; rank < num_xstreams; rank++) { + int num_assoc_places = 0; + int num_places = p_affinity_places->num_places; + for (int place_id = 0; place_id < num_places; place_id++) { + kmp_abt_affinity_place_t *p_place = + p_affinity_places->p_places[place_id]; + for (int i = 0, num_ranks = p_place->num_ranks; i < num_ranks; i++) { + if (p_place->ranks[i] == rank) + num_assoc_places++; + } + } + if (num_assoc_places > 1) { + failure = true; + break; + } + } + if (failure) { + printf("[warning] More than one place are associated with the same " + "processor; fall back to a default affinity.\n"); + __kmp_abt_affinity_places_free(p_affinity_places); + p_affinity_places = __kmp_abt_parse_affinity(num_xstreams, "threads", + strlen("threads"), + verbose); + } + } + + env = getenv("KMP_ABT_FORK_CUTOFF"); + if (env) { + __kmp_abt_global.fork_cutoff = atoi(env); + if (__kmp_abt_global.fork_cutoff <= 0) + __kmp_abt_global.fork_cutoff = 1; + } else { + __kmp_abt_global.fork_cutoff = KMP_ABT_FORK_CUTOFF_DEFAULT; + } + if (verbose) + printf("KMP_ABT_FORK_CUTOFF = %d\n", __kmp_abt_global.fork_cutoff); + + env = getenv("KMP_ABT_FORK_NUM_WAYS"); + if (env) { + __kmp_abt_global.fork_num_ways = atoi(env); + if (__kmp_abt_global.fork_num_ways <= 1) + __kmp_abt_global.fork_num_ways = 2; + } else { + __kmp_abt_global.fork_num_ways = KMP_ABT_FORK_NUM_WAYS_DEFAULT; + } + if (verbose) + printf("KMP_ABT_FORK_NUM_WAYS = %d\n", __kmp_abt_global.fork_num_ways); + + env = getenv("KMP_ABT_SCHED_SLEEP"); + if (env) { + __kmp_abt_global.is_sched_sleep = atoi(env); + } else { + __kmp_abt_global.is_sched_sleep = KMP_ABT_SCHED_SLEEP_DEFAULT; + } + if (verbose) + printf("KMP_ABT_SCHED_SLEEP = %d\n", __kmp_abt_global.is_sched_sleep); + + env = getenv("KMP_ABT_SCHED_MIN_SLEEP_NSEC"); + if (env) { + __kmp_abt_global.sched_sleep_min_nsec = atoi(env); + if (__kmp_abt_global.sched_sleep_min_nsec <= 0) + __kmp_abt_global.sched_sleep_min_nsec = 0; + } else { + __kmp_abt_global.sched_sleep_min_nsec + = KMP_ABT_SCHED_MIN_SLEEP_NSEC_DEFAULT; + } + if (verbose) + printf("KMP_ABT_SCHED_MIN_SLEEP_NSEC = %d\n", + __kmp_abt_global.sched_sleep_min_nsec); + + env = getenv("KMP_ABT_SCHED_MAX_SLEEP_NSEC"); + if (env) { + __kmp_abt_global.sched_sleep_max_nsec = 
atoi(env); + if (__kmp_abt_global.sched_sleep_max_nsec + < __kmp_abt_global.sched_sleep_min_nsec) + __kmp_abt_global.sched_sleep_max_nsec + = __kmp_abt_global.sched_sleep_min_nsec; + } else { + __kmp_abt_global.sched_sleep_max_nsec + = KMP_ABT_SCHED_MAX_SLEEP_NSEC_DEFAULT; + } + if (verbose) + printf("KMP_ABT_SCHED_MAX_SLEEP_NSEC = %d\n", + __kmp_abt_global.sched_sleep_max_nsec); + + env = getenv("KMP_ABT_SCHED_EVENT_FREQ"); + if (env) { + __kmp_abt_global.sched_event_freq = atoi(env); + if (__kmp_abt_global.sched_event_freq <= 1) + __kmp_abt_global.sched_event_freq = 1; + if (__kmp_abt_global.sched_event_freq > KMP_ABT_SCHED_EVENT_FREQ_MAX) + __kmp_abt_global.sched_event_freq = KMP_ABT_SCHED_EVENT_FREQ_MAX; + } else { + __kmp_abt_global.sched_event_freq = KMP_ABT_SCHED_EVENT_FREQ_DEFAULT; + } + // Must be 2^N + for (int digit = 0;; digit++) { + if ((1 << digit) >= __kmp_abt_global.sched_event_freq) { + __kmp_abt_global.sched_event_freq = 1 << digit; + break; + } + } + if (verbose) + printf("KMP_ABT_SCHED_EVENT_FREQ = %d\n", + __kmp_abt_global.sched_event_freq); + + env = getenv("KMP_ABT_WORK_STEAL_FREQ"); + if (env) { + __kmp_abt_global.work_steal_freq = atoi(env); + if (__kmp_abt_global.work_steal_freq <= 0) + __kmp_abt_global.work_steal_freq = 0; + } else { + __kmp_abt_global.work_steal_freq = KMP_ABT_WORK_STEAL_FREQ_DEFAULT; + } + // Must be 2^N + if (__kmp_abt_global.work_steal_freq != 0) { + for (uint32_t digit = 0;; digit++) { + if ((1u << digit) >= __kmp_abt_global.work_steal_freq) { + __kmp_abt_global.work_steal_freq = 1u << digit; + break; + } + } + } + if (verbose) + printf("KMP_ABT_WORK_STEAL_FREQ = %ud\n", + (unsigned int)__kmp_abt_global.work_steal_freq); + } + + KA_TRACE(10, ("__kmp_abt_initialize: # of ESs = %d\n", num_xstreams)); + + __kmp_abt_global.locals = (kmp_abt_local *)__kmp_allocate + (sizeof(kmp_abt_local) * num_xstreams); + __kmp_abt_global.num_xstreams = num_xstreams; + for (int rank = 0; rank < num_xstreams; rank++) { + __kmp_abt_global.locals[rank].place_id = -1; + __kmp_abt_global.locals[rank].place_pool = ABT_POOL_NULL; + } + + /* Create place pools. */ + const int num_places = p_affinity_places->num_places; + KMP_ASSERT(num_places != 0); + ABT_pool *place_pools = (ABT_pool *)__kmp_allocate(sizeof(ABT_pool) + * num_places); + __kmp_abt_global.num_places = num_places; + __kmp_abt_global.place_pools = place_pools; + for (int place_id = 0; place_id < num_places; place_id++) { + const int num_ranks = p_affinity_places->p_places[place_id]->num_ranks; + ABT_pool_access access = (num_ranks == 1) ? 
ABT_POOL_ACCESS_MPSC + : ABT_POOL_ACCESS_MPMC; + status = ABT_pool_create_basic(ABT_POOL_FIFO, access, ABT_TRUE, + &place_pools[place_id]); + KMP_CHECK_SYSFAIL("ABT_pool_create_basic", status); + for (int i = 0; i < num_ranks; i++) { + int rank = p_affinity_places->p_places[place_id]->ranks[i]; + __kmp_abt_global.locals[rank].place_id = place_id; + __kmp_abt_global.locals[rank].place_pool = place_pools[place_id]; + } + } + __kmp_abt_affinity_places_free(p_affinity_places); + + /* Create shared/private pools */ + for (i = 0; i < num_xstreams; i++) { + status = ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, + ABT_TRUE, + &__kmp_abt_global.locals[i].shared_pool); + KMP_CHECK_SYSFAIL("ABT_pool_create_basic", status); + } + + /* Create schedulers */ + ABT_sched_def sched_def = { + .type = ABT_SCHED_TYPE_ULT, + .init = __kmp_abt_sched_init, + .run = __kmp_abt_sched_run, + .free = __kmp_abt_sched_free, + .get_migr_pool = NULL + }; + + ABT_pool *my_pools; + my_pools = (ABT_pool *)malloc((num_xstreams + 1) * sizeof(ABT_pool)); + + for (i = 0; i < num_xstreams; i++) { + for (k = 0; k < num_xstreams; k++) { + my_pools[k] = + __kmp_abt_global.locals[(i + k) % num_xstreams].shared_pool; + } + int num_pools = num_xstreams; + if (__kmp_abt_global.locals[i].place_id != -1) { + my_pools[num_pools++] = __kmp_abt_global.locals[i].place_pool; + } + status = ABT_sched_create(&sched_def, num_pools, my_pools, + ABT_SCHED_CONFIG_NULL, + &__kmp_abt_global.locals[i].sched); + KMP_CHECK_SYSFAIL("ABT_sched_create", status); + } + + free(my_pools); + + /* Create ESs */ + status = ABT_xstream_self(&__kmp_abt_global.locals[0].xstream); + KMP_CHECK_SYSFAIL("ABT_xstream_self", status); + status = ABT_xstream_set_main_sched(__kmp_abt_global.locals[0].xstream, + __kmp_abt_global.locals[0].sched); + KMP_CHECK_SYSFAIL("ABT_xstream_set_main_sched", status); + for (i = 1; i < num_xstreams; i++) { + status = ABT_xstream_create(__kmp_abt_global.locals[i].sched, + &__kmp_abt_global.locals[i].xstream); + KMP_CHECK_SYSFAIL("ABT_xstream_create", status); + } +} + +static void __kmp_abt_finalize(void) { + int status; + int i; + + for (i = 1; i < __kmp_abt_global.num_xstreams; i++) { + status = ABT_xstream_join(__kmp_abt_global.locals[i].xstream); + KMP_CHECK_SYSFAIL("ABT_xstream_join", status); + status = ABT_xstream_free(&__kmp_abt_global.locals[i].xstream); + KMP_CHECK_SYSFAIL("ABT_xstream_free", status); + } + + /* Free schedulers */ + for (i = 1; i < __kmp_abt_global.num_xstreams; i++) { + status = ABT_sched_free(&__kmp_abt_global.locals[i].sched); + KMP_CHECK_SYSFAIL("ABT_sched_free", status); + } + + __kmp_free(__kmp_abt_global.locals); + __kmp_free(__kmp_abt_global.place_pools); + __kmp_abt_global.num_xstreams = 0; + __kmp_abt_global.locals = NULL; + __kmp_abt_global.place_pools = NULL; +} + +volatile int __kmp_abt_init_global = FALSE; +void __kmp_abt_global_initialize() { + int status; + // Initialize Argobots before other initializations. 
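  /*
   * ABT_init has to come before any other Argobots call; the execution
   * topology itself is built later in __kmp_abt_initialize().  A sketch of
   * what that topology looks like for two execution streams (illustrative,
   * the actual counts depend on KMP_ABT_NUM_ESS and OMP_PLACES):
   *
   *   ES0 -- sched0 --> [ shared_pool0, shared_pool1, place_pool_of_ES0 ]
   *   ES1 -- sched1 --> [ shared_pool1, shared_pool0, place_pool_of_ES1 ]
   *
   * Every scheduler lists its own shared pool first, the other shared pools
   * next (these are the victims of the work stealing in
   * __kmp_abt_sched_run), and its place pool last when the ES was assigned
   * a place by the affinity parsing.
   */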
+ status = ABT_init(0, NULL); + KMP_CHECK_SYSFAIL("ABT_init", status); + __kmp_abt_init_global = TRUE; +} + +void __kmp_abt_global_destroy() { + ABT_finalize(); + __kmp_abt_init_global = FALSE; +} + +static int __kmp_abt_sched_init(ABT_sched sched, ABT_sched_config config) { + return ABT_SUCCESS; +} + +static void __kmp_abt_sched_run(ABT_sched sched) { + uint32_t work_count = 0; + int num_pools, num_shared_pools = __kmp_abt_global.num_xstreams; + int rank; + ABT_xstream_self_rank(&rank); + ABT_pool *shared_pools; + ABT_pool place_pool; + uint32_t seed; + const int sched_event_freq = __kmp_abt_global.sched_event_freq; + const int sched_sleep_min_nsec = __kmp_abt_global.sched_sleep_min_nsec; + const int sched_sleep_max_nsec = __kmp_abt_global.sched_sleep_max_nsec; + int sched_sleep_nsec = __kmp_abt_global.is_sched_sleep ? sched_sleep_min_nsec + : -1; + const uint32_t work_steal_freq = __kmp_abt_global.work_steal_freq; + do { + seed = (uint32_t)time(NULL) + 64 + rank; + } while (seed == 0); + KMP_DEBUG_ASSERT(!(sched_event_freq & (sched_event_freq - 1))); // must be 2^N + const uint32_t sched_event_freq_mask = sched_event_freq - 1; + KMP_DEBUG_ASSERT(!(work_steal_freq & (work_steal_freq - 1))); // must be 2^N + const uint32_t work_steal_freq_mask = work_steal_freq - 1; + + ABT_sched_get_num_pools(sched, &num_pools); + shared_pools = (ABT_pool *)alloca(num_pools * sizeof(ABT_pool)); + ABT_sched_get_pools(sched, num_pools, 0, shared_pools); + place_pool = __kmp_abt_global.locals[rank].place_pool; + + while (1) { + ABT_unit unit; + int run_cnt = 0; + + /* From the place pool */ + if (place_pool != ABT_POOL_NULL) { + ABT_pool_pop(place_pool, &unit); + if (unit != ABT_UNIT_NULL) { + ABT_xstream_run_unit(unit, place_pool); + run_cnt++; + } + } + + /* From the shared pool */ + ABT_pool_pop(shared_pools[0], &unit); + if (unit != ABT_UNIT_NULL) { + ABT_xstream_run_unit(unit, shared_pools[0]); + run_cnt++; + } + + /* Steal a work unit from other pools */ + if (num_shared_pools >= 2 + && (run_cnt == 0 || !(work_count & work_steal_freq_mask))) { + int target = __kmp_abt_fast_rand32(&seed) % + ((uint32_t)(num_shared_pools - 1)) + 1; + ABT_pool_pop(shared_pools[target], &unit); + if (unit != ABT_UNIT_NULL) { + ABT_unit_set_associated_pool(unit, shared_pools[0]); + ABT_xstream_run_unit(unit, shared_pools[0]); + run_cnt++; + } + } + + if (!(++work_count & sched_event_freq_mask)) { + ABT_bool stop; + ABT_xstream_check_events(sched); + ABT_sched_has_to_stop(sched, &stop); + if (stop == ABT_TRUE) + break; + if (sched_sleep_nsec >= 0) { + if (run_cnt == 0) { + struct timespec sleep_time; + sleep_time.tv_sec = 0; + sleep_time.tv_nsec = sched_sleep_nsec; + nanosleep(&sleep_time, NULL); + sched_sleep_nsec = (sched_sleep_nsec == 0) ? 
1 + : (sched_sleep_nsec << 1); + if (sched_sleep_nsec > sched_sleep_max_nsec) { + sched_sleep_nsec = sched_sleep_max_nsec; + } + } else { + sched_sleep_nsec = sched_sleep_min_nsec; + } + } + } + } +} + +static int __kmp_abt_sched_free(ABT_sched sched) { + return ABT_SUCCESS; +} + +static inline void __kmp_abt_free_task(kmp_info_t *th, kmp_taskdata_t *taskdata) +{ + int gtid = __kmp_gtid_from_thread(th); + + KA_TRACE(30, ("__kmp_abt_free_task: (enter) T#%d - task %p\n", + gtid, taskdata)); + + /* [AC] we need those steps to mark the task as finished so the dependencies + * can be completed */ + taskdata->td_flags.complete = 1; // mark the task as completed + __kmp_release_deps(gtid, taskdata); + taskdata->td_flags.executing = 0; // suspend the finishing task + + // Wait for all tasks after releasing (=pushing) dependent tasks + __kmp_abt_wait_child_tasks(th, true, FALSE); + + taskdata->td_flags.freed = 1; + + /* Free the task queue if it was allocated. */ + if (taskdata->td_task_queue) { + KMP_DEBUG_ASSERT(taskdata->td_tq_cur_size == 0); + KMP_INTERNAL_FREE(taskdata->td_task_queue); + } + + // deallocate the taskdata and shared variable blocks associated with this + // task +#if USE_FAST_MEMORY + __kmp_fast_free(th, taskdata); +#else + __kmp_thread_free(th, taskdata); +#endif + + KA_TRACE(30, ("__kmp_abt_free_task: (exit) T#%d - task %p\n", + gtid, taskdata)); +} + +static void __kmp_abt_execute_task(void *arg) { + // It is corresponding to __kmp_execute_tasks_. + + kmp_task_t *task = (kmp_task_t *)arg; + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_info_t *th; + + th = __kmp_abt_bind_task_to_thread(taskdata->td_team, taskdata); + + KA_TRACE(20, ("__kmp_abt_execute_task: T#%d before executing task %p.\n", + __kmp_gtid_from_thread(th), task)); + + // See __kmp_task_start + taskdata->td_flags.started = 1; + taskdata->td_flags.executing = 1; + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + + while (1) { + // Run __kmp_invoke_task to handle internal counters correctly. +#ifdef KMP_GOMP_COMPAT + if (taskdata->td_flags.native) { + ((void (*)(void *))(*(task->routine)))(task->shareds); + } else +#endif /* KMP_GOMP_COMPAT */ + { + (*(task->routine))(__kmp_gtid_from_thread(th), task); + } + + if (!taskdata->td_flags.tiedness) { + // If this task is an untied one, we need to retrieve kmp_info because it + // may have been changed. + th = __kmp_abt_get_self_info(); + } + // See __kmp_task_finish (untied) + if (taskdata->td_flags.tiedness == TASK_UNTIED) { + // Check if we can finish this task. + kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1; + if (counter > 0) { + // We should keep this ULT. + continue; + } + } + // tied or finished untied. + break; + } + + // See __kmp_task_finish (tied/finished untied) + // KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0); + taskdata->td_flags.executing = 0; + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + taskdata->td_flags.complete = 1; // mark the task as completed + // KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); + // KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + + // Free this task. + __kmp_abt_free_task(th, taskdata); + + // Reset th's ownership. 
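+  // Once released, this kmp_info becomes visible to
+  // __kmp_abt_bind_task_to_thread again, so another task ULT can claim it.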
+ __kmp_abt_release_info(th); + + KA_TRACE(20, ("__kmp_abt_execute_task: T#%d after executing task %p.\n", + __kmp_gtid_from_thread(th), task)); +} + +int __kmp_abt_create_task(kmp_info_t *th, kmp_task_t *task) { + int status; + ABT_pool dest = __kmp_abt_get_pool_task(); + + KA_TRACE(20, ("__kmp_abt_create_task: T#%d before creating task %p into the " + "pool %p.\n", __kmp_gtid_from_thread(th), task, dest)); + + /* Check if the task queue has an empty slot. */ + kmp_taskdata_t *td = th->th.th_current_task; + if (td->td_tq_cur_size == td->td_tq_max_size) { + size_t new_max_size; + if (td->td_tq_max_size == 0) { + /* Empty queue. We allocate 32 slots by default. */ + new_max_size = 32; + } else { + /* The task queue is full. Expand it. */ + new_max_size = td->td_tq_max_size * 2; + } + + void *queue = (void *)td->td_task_queue; + size_t size = sizeof(kmp_abt_task_t) * new_max_size; + td->td_task_queue = (kmp_abt_task_t *)KMP_INTERNAL_REALLOC(queue, size); + td->td_tq_max_size = new_max_size; + } + + status = ABT_thread_create(dest, __kmp_abt_execute_task, (void *)task, + ABT_THREAD_ATTR_NULL, + &td->td_task_queue[td->td_tq_cur_size++]); + KMP_ASSERT(status == ABT_SUCCESS); + + KA_TRACE(20, ("__kmp_abt_create_task: T#%d after creating task %p into the " + "pool %p.\n", __kmp_gtid_from_thread(th), task, dest)); + + return TRUE; +} + +kmp_info_t *__kmp_abt_wait_child_tasks(kmp_info_t *th, bool thread_bind, + int yield) { + KA_TRACE(20, ("__kmp_abt_wait_child_tasks: T#%d enter\n", + __kmp_gtid_from_thread(th))); + + int i, status; + kmp_taskdata_t *taskdata = th->th.th_current_task; + // Get the associated team before releasing the ownership of th. + kmp_team_t *team = th->th.th_team; + kmp_info_t *new_th = th; + + if (taskdata->td_tq_cur_size == 0) { + /* leaf task case */ + if (yield) { + __kmp_abt_release_info(th); + + ABT_thread_yield(); + + if (thread_bind || taskdata->td_flags.tiedness) { + __kmp_abt_acquire_info_for_task(th, taskdata, team); + } else { + new_th = __kmp_abt_bind_task_to_thread(team, taskdata); + } + } + KA_TRACE(20, ("__kmp_abt_wait_child_tasks: T#%d done\n", + __kmp_gtid_from_thread(new_th))); + return new_th; + } + + /* Let others, e.g., tasks, can use this kmp_info */ + __kmp_abt_release_info(th); + + /* Give other tasks a chance for execution */ + if (yield) + ABT_thread_yield(); + + /* Wait until all child tasks are complete. */ + for (i = 0; i < taskdata->td_tq_cur_size; i++) { + status = ABT_thread_free(&taskdata->td_task_queue[i]); + KMP_ASSERT(status == ABT_SUCCESS); + } + taskdata->td_tq_cur_size = 0; + + if (thread_bind || taskdata->td_flags.tiedness) { + /* Obtain kmp_info to continue the original task. */ + __kmp_abt_acquire_info_for_task(th, taskdata, team); + } else { + new_th = __kmp_abt_bind_task_to_thread(team, taskdata); + } + + KA_TRACE(20, ("__kmp_abt_wait_child_tasks: T#%d done\n", + __kmp_gtid_from_thread(new_th))); + return new_th; +} + +kmp_info_t *__kmp_abt_bind_task_to_thread(kmp_team_t *team, + kmp_taskdata_t *taskdata) { + int i, i_start, i_end; + kmp_info_t *th = NULL; + + KA_TRACE(20, ("__kmp_abt_bind_task_to_thread: (enter) task %p\n", taskdata)); + + /* To handle gtid in the task code, we look for a suspended (blocked) + * thread in the team and use its info to execute this task. */ + while (1) { + if (team->t.t_level <= 1) { + /* outermost team - we try to assign the thread that was executed on + * the same ES first and then check other threads in the team. 
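+       * Preferring the local ES is presumably meant to keep the task near
+       * the worker that has been running on this core.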
*/ + int rank; + ABT_xstream_self_rank(&rank); + if (rank < team->t.t_nproc) { + /* [SM] I think this condition should always be true, but just in + * case I miss something we check this condition. */ + i_start = rank; + i_end = team->t.t_nproc + rank; + } else { + i_start = 0; + i_end = team->t.t_nproc; + } + } else { + /* nested team - we ignore the ES info since threads in the nested team + * may be executed by any ES. */ + i_start = 0; + i_end = team->t.t_nproc; + } + /* TODO: This is a linear search. Can we do better? */ + for (i = i_start; i < i_end; i++) { + int idx = (i < team->t.t_nproc) ? i : i % team->t.t_nproc; + th = team->t.t_threads[idx]; + ABT_thread ult = th->th.th_info.ds.ds_thread; + + if (th->th.th_active == FALSE && ult != ABT_THREAD_NULL) { + /* Try to take the ownership of kmp_info 'th' */ + if (th->th.th_team != team) + continue; + if (KMP_COMPARE_AND_STORE_RET32(&th->th.th_active, FALSE, TRUE) + == FALSE) { + if (th->th.th_team != team) { + __kmp_abt_release_info(th); + continue; + } + /* Bind this task as if it is executed by 'th'. */ + th->th.th_current_task = taskdata; + th->th.th_task_team = taskdata->td_task_team; + __kmp_abt_set_self_info(th); + KA_TRACE(20, ("__kmp_abt_bind_task_to_thread: (exit) task %p" + "bound to T#%d\n", + taskdata, __kmp_gtid_from_thread(th))); + return th; + } + } + } + /* We could not find an available kmp_info. Thus, this task yields + * control to other work units and will try to find one later. */ + ABT_thread_yield(); + } + return NULL; +} + +void __kmp_abt_create_uber(int gtid, kmp_info_t *th, size_t stack_size) { + KMP_DEBUG_ASSERT(KMP_UBER_GTID(gtid)); + KA_TRACE(10, ("__kmp_abt_create_uber: T#%d\n", gtid)); + ABT_thread handle; + ABT_thread_self(&handle); + if (handle == ABT_THREAD_NULL) { + // External threads might call this function. 
In this case, we do not need + // to set `th` since external threads use pthread_setspecific, + __kmp_gtid_set_specific(gtid); + } else { + ABT_thread_set_arg(handle, (void *)th); + } + th->th.th_info.ds.ds_thread = handle; +} + +#endif // KMP_USE_ABT + // end of file // diff --git a/runtime/src/z_Windows_NT_util.cpp b/runtime/src/z_Windows_NT_util.cpp index c149dda56..af231e234 100644 --- a/runtime/src/z_Windows_NT_util.cpp +++ b/runtime/src/z_Windows_NT_util.cpp @@ -363,7 +363,7 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { th_gtid, flag->get())); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for flag's" " loc(%p)\n", @@ -375,7 +375,7 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } @@ -437,21 +437,26 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { } } - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); - + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); } -void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } -void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); + /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE */ template @@ -467,7 +472,7 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { gtid, target_gtid)); __kmp_suspend_initialize_thread(th); - __kmp_win32_mutex_lock(&th->th.th_suspend_mx); + __kmp_lock_suspend_mx(th); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)th->th.th_sleep_loc; @@ -481,7 +486,7 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p)\n", gtid, target_gtid, NULL)); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } else { typename C::flag_t old_spin = flag->unset_sleeping(); @@ -489,7 +494,7 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " "awake: flag's loc(%p): %u => %u\n", gtid, target_gtid, flag->get(), old_spin, *(flag->get()))); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); return; } } @@ -499,23 +504,28 @@ static inline void __kmp_resume_template(int target_gtid, C *flag) { gtid, target_gtid, flag->get())); __kmp_win32_cond_signal(&th->th.th_suspend_cv); - __kmp_win32_mutex_unlock(&th->th.th_suspend_mx); + __kmp_unlock_suspend_mx(th); KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" " for T#%d\n", gtid, target_gtid)); } -void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { +template +void 
__kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { __kmp_resume_template(target_gtid, flag); } -void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { __kmp_resume_template(target_gtid, flag); } void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { __kmp_resume_template(target_gtid, flag); } +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); + void __kmp_yield() { Sleep(0); } void __kmp_gtid_set_specific(int gtid) { @@ -1504,7 +1514,7 @@ void __kmp_free_handle(kmp_thread_t tHandle) { int __kmp_get_load_balance(int max) { static ULONG glb_buff_size = 100 * 1024; - // Saved count of the running threads for the thread balance algortihm + // Saved count of the running threads for the thread balance algorithm static int glb_running_threads = 0; static double glb_call_time = 0; /* Thread balance algorithm call time */ diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt index 851377f73..fcb46ff85 100644 --- a/runtime/test/CMakeLists.txt +++ b/runtime/test/CMakeLists.txt @@ -2,6 +2,12 @@ include(CheckFunctionExists) include(CheckLibraryExists) +# Remove -Werror +set(CMAKE_C_FLAGS "${CMAKE_ORIG_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_ORIG_CXX_FLAGS}") +string(REPLACE " -Werror" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") +string(REPLACE " -Werror" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + # Some tests use math functions check_library_exists(m sqrt "" LIBOMP_HAVE_LIBM) # When using libgcc, -latomic may be needed for atomics @@ -16,6 +22,10 @@ else() set(LIBOMP_HAVE_LIBATOMIC 0) endif() +# Undo changes +set(CMAKE_ORIG_C_FLAGS "${CMAKE_C_FLAGS}") +set(CMAKE_ORIG_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + macro(pythonize_bool var) if (${var}) set(${var} True) @@ -25,15 +35,18 @@ macro(pythonize_bool var) endmacro() pythonize_bool(LIBOMP_USE_HWLOC) -pythonize_bool(LIBOMP_OMPT_SUPPORT) +pythonize_bool(LIBBOLT_OMPT_SUPPORT) pythonize_bool(LIBOMP_OMPT_OPTIONAL) pythonize_bool(LIBOMP_HAVE_LIBM) pythonize_bool(LIBOMP_HAVE_LIBATOMIC) -add_openmp_testsuite(check-libomp "Running libomp tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omp) +add_library(bolt-ompt-print-callback INTERFACE) +target_include_directories(bolt-ompt-print-callback INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/ompt) + + +add_openmp_testsuite(check-bolt-libomp "Running libomp tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS bolt-omp) # Add target check-ompt, but make sure to not add the tests twice to check-openmp. 
-set(EXCLUDE_FROM_ALL True) -add_openmp_testsuite(check-ompt "Running OMPT tests" ${CMAKE_CURRENT_BINARY_DIR}/ompt DEPENDS omp) +add_openmp_testsuite(check-bolt-ompt "Running OMPT tests" ${CMAKE_CURRENT_BINARY_DIR}/ompt EXCLUDE_FROM_CHECK_ALL DEPENDS bolt-omp) # Configure the lit.site.cfg.in file set(AUTO_GEN_COMMENT "## Autogenerated by libomp configuration.\n# Do not edit!") diff --git a/runtime/test/affinity/format/affinity_display.1.c b/runtime/test/affinity/format/affinity_display.1.c index fe357d3b3..bb052c0c4 100644 --- a/runtime/test/affinity/format/affinity_display.1.c +++ b/runtime/test/affinity/format/affinity_display.1.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile // RUN: env OMP_DISPLAY_AFFINITY=TRUE OMP_NUM_THREADS=4 OMP_PLACES='{0,1},{2,3},{4,5},{6,7}' %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt // Affinity Display examples #include diff --git a/runtime/test/affinity/format/affinity_values.c b/runtime/test/affinity/format/affinity_values.c index 37ab2101a..5e72f6703 100644 --- a/runtime/test/affinity/format/affinity_values.c +++ b/runtime/test/affinity/format/affinity_values.c @@ -4,7 +4,7 @@ // RUN: env OMP_PROC_BIND=close OMP_PLACES=sockets %libomp-run // RUN: env KMP_AFFINITY=compact %libomp-run // RUN: env KMP_AFFINITY=scatter %libomp-run -// REQUIRES: affinity +// REQUIRES: affinity && !abt #include #include diff --git a/runtime/test/affinity/format/api.c b/runtime/test/affinity/format/api.c index 08805e7a5..416678756 100644 --- a/runtime/test/affinity/format/api.c +++ b/runtime/test/affinity/format/api.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile-and-run // RUN: %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/api2.c b/runtime/test/affinity/format/api2.c index c32da938f..9b4b50c00 100644 --- a/runtime/test/affinity/format/api2.c +++ b/runtime/test/affinity/format/api2.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile-and-run // RUN: %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include @@ -13,8 +14,8 @@ #define check(condition) \ if (!(condition)) { \ - fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__, \ - __LINE__); \ + fprintf(stderr, "error: %s: %d: %s\n", __FILE__, __LINE__, \ + STR(condition)); \ exit(1); \ } diff --git a/runtime/test/affinity/format/fields_modifiers.c b/runtime/test/affinity/format/fields_modifiers.c index c18027157..719c8fa88 100644 --- a/runtime/test/affinity/format/fields_modifiers.c +++ b/runtime/test/affinity/format/fields_modifiers.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/fields_values.c b/runtime/test/affinity/format/fields_values.c index e56ce2786..5a2b0f52e 100644 --- a/runtime/test/affinity/format/fields_values.c +++ b/runtime/test/affinity/format/fields_values.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/increase.c b/runtime/test/affinity/format/increase.c index b3942dbbc..cf08c27b9 100644 --- a/runtime/test/affinity/format/increase.c +++ b/runtime/test/affinity/format/increase.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/nested.c b/runtime/test/affinity/format/nested.c index 2ecc91897..ec6e12d25 100644 --- a/runtime/test/affinity/format/nested.c +++ 
b/runtime/test/affinity/format/nested.c @@ -1,5 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES=threads OMP_PROC_BIND=spread,close %libomp-run | %python %S/check.py -c 'CHECK' %s -// REQUIRES: affinity +// REQUIRES: affinity && !abt #include #include diff --git a/runtime/test/affinity/format/nested2.c b/runtime/test/affinity/format/nested2.c index 4b54912d2..7e5b8f529 100644 --- a/runtime/test/affinity/format/nested2.c +++ b/runtime/test/affinity/format/nested2.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES=threads OMP_PROC_BIND=spread,close KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/nested_mixed.c b/runtime/test/affinity/format/nested_mixed.c index 1e4c75372..5581e3591 100644 --- a/runtime/test/affinity/format/nested_mixed.c +++ b/runtime/test/affinity/format/nested_mixed.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/nested_serial.c b/runtime/test/affinity/format/nested_serial.c index 8b84ba65c..a452b34dc 100644 --- a/runtime/test/affinity/format/nested_serial.c +++ b/runtime/test/affinity/format/nested_serial.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/proc_bind.c b/runtime/test/affinity/format/proc_bind.c index 765c3ceaa..f1da2ac65 100644 --- a/runtime/test/affinity/format/proc_bind.c +++ b/runtime/test/affinity/format/proc_bind.c @@ -1,5 +1,5 @@ // RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES='{0},{0,1},{0},{0,1},{0},{0,1},{0},{0,1},{0},{0,1},{0}' %libomp-run | %python %S/check.py -c 'CHECK' %s -// REQUIRES: affinity +// REQUIRES: affinity && !abt #include #include diff --git a/runtime/test/affinity/format/simple.c b/runtime/test/affinity/format/simple.c index 701c20720..55910d297 100644 --- a/runtime/test/affinity/format/simple.c +++ b/runtime/test/affinity/format/simple.c @@ -5,6 +5,7 @@ // RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=3 %libomp-run | %python %S/check.py -c 'CHECK-3' %s // RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=4 %libomp-run | %python %S/check.py -c 'CHECK-4' %s // RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=8 %libomp-run | %python %S/check.py -c 'CHECK-8' %s +// REQUIRES: !abt #include #include diff --git a/runtime/test/affinity/format/simple_env.c b/runtime/test/affinity/format/simple_env.c index ad0a2651e..ae6c89ef0 100644 --- a/runtime/test/affinity/format/simple_env.c +++ b/runtime/test/affinity/format/simple_env.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile // RUN: env OMP_DISPLAY_AFFINITY=true OMP_AFFINITY_FORMAT='TESTER-ENV: tl:%L tn:%n nt:%N' OMP_NUM_THREADS=8 %libomp-run | %python %S/check.py -c 'CHECK-8' %s +// REQUIRES: !abt #include #include @@ -13,4 +14,4 @@ int main(int argc, char** argv) { return 0; } -// CHECK-8: num_threads=8 TESTER-ENV: tl:1 tn:[0-7] nt:8 +// CHECK-8: num_threads=8 TESTER-ENV: tl:1 tn:[0-7] nt:8$ diff --git a/runtime/test/api/omp_alloc_def_fb.c b/runtime/test/api/omp_alloc_def_fb.c index 3795f0990..d75366d84 100644 --- a/runtime/test/api/omp_alloc_def_fb.c +++ b/runtime/test/api/omp_alloc_def_fb.c @@ -7,13 +7,13 @@ int main() { omp_alloctrait_t at[2]; omp_allocator_handle_t a; void *p[2]; - at[0].key = 
OMP_ATK_POOL_SIZE; + at[0].key = omp_atk_pool_size; at[0].value = 2 * 1024 * 1024; - at[1].key = OMP_ATK_FALLBACK; - at[1].value = OMP_ATV_DEFAULT_MEM_FB; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; a = omp_init_allocator(omp_large_cap_mem_space, 2, at); - printf("allocator large created: %p\n", a); - #pragma omp parallel num_threads(2) + printf("allocator large created: %p\n", (void *)a); +#pragma omp parallel num_threads(2) { int i = omp_get_thread_num(); p[i] = omp_alloc(1024 * 1024, a); diff --git a/runtime/test/api/omp_alloc_hbw.c b/runtime/test/api/omp_alloc_hbw.c index e94454896..2aca26c10 100644 --- a/runtime/test/api/omp_alloc_hbw.c +++ b/runtime/test/api/omp_alloc_hbw.c @@ -7,13 +7,13 @@ int main() { omp_alloctrait_t at[2]; omp_allocator_handle_t a; void *p[2]; - at[0].key = OMP_ATK_POOL_SIZE; + at[0].key = omp_atk_pool_size; at[0].value = 2 * 1024 * 1024; - at[1].key = OMP_ATK_FALLBACK; - at[1].value = OMP_ATV_NULL_FB; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_null_fb; a = omp_init_allocator(omp_high_bw_mem_space, 2, at); - printf("allocator hbw created: %p\n", a); - #pragma omp parallel num_threads(2) + printf("allocator hbw created: %p\n", (void *)a); +#pragma omp parallel num_threads(2) { int i = omp_get_thread_num(); p[i] = omp_alloc(1024 * 1024, a); diff --git a/runtime/test/api/omp_alloc_null_fb.c b/runtime/test/api/omp_alloc_null_fb.c index 9528c4604..9ed2d8cc4 100644 --- a/runtime/test/api/omp_alloc_null_fb.c +++ b/runtime/test/api/omp_alloc_null_fb.c @@ -7,13 +7,13 @@ int main() { omp_alloctrait_t at[2]; omp_allocator_handle_t a; void *p[2]; - at[0].key = OMP_ATK_POOL_SIZE; + at[0].key = omp_atk_pool_size; at[0].value = 2 * 1024 * 1024; - at[1].key = OMP_ATK_FALLBACK; - at[1].value = OMP_ATV_NULL_FB; - a = omp_init_allocator(omp_large_cap_mem_space, 2, at); - printf("allocator large created: %p\n", a); - #pragma omp parallel num_threads(2) + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_null_fb; + a = omp_init_allocator(omp_default_mem_space, 2, at); + printf("allocator created: %p\n", (void *)a); +#pragma omp parallel num_threads(2) { int i = omp_get_thread_num(); #pragma omp barrier diff --git a/runtime/test/api/omp_calloc_def_fb.c b/runtime/test/api/omp_calloc_def_fb.c new file mode 100644 index 000000000..e9b90fbeb --- /dev/null +++ b/runtime/test/api/omp_calloc_def_fb.c @@ -0,0 +1,32 @@ +// RUN: %libomp-compile-and-run + +#include +#include + +int main() { + omp_alloctrait_t at[2]; + omp_allocator_handle_t a; + void *p[2]; + at[0].key = omp_atk_pool_size; + at[0].value = 2 * 1024 * 1024; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; + a = omp_init_allocator(omp_large_cap_mem_space, 2, at); + printf("allocator large created: %p\n", (void *)a); + #pragma omp parallel num_threads(2) + { + int i = omp_get_thread_num(); + p[i] = omp_calloc(1024, 1024, a); + #pragma omp barrier + printf("th %d, ptr %p\n", i, p[i]); + omp_free(p[i], a); + } + // Both pointers should be non-NULL + if (p[0] != NULL && p[1] != NULL) { + printf("passed\n"); + return 0; + } else { + printf("failed: pointers %p %p\n", p[0], p[1]); + return 1; + } +} diff --git a/runtime/test/api/omp_calloc_size_0.c b/runtime/test/api/omp_calloc_size_0.c new file mode 100644 index 000000000..0902ca600 --- /dev/null +++ b/runtime/test/api/omp_calloc_size_0.c @@ -0,0 +1,33 @@ +// RUN: %libomp-compile-and-run + +#include +#include + +int main() +{ + omp_alloctrait_t at[2]; + omp_allocator_handle_t a; + void *p[2]; + at[0].key = 
omp_atk_pool_size; + at[0].value = 2*1024*1024; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; + a = omp_init_allocator(omp_large_cap_mem_space, 2, at); + printf("allocator large created: %p\n", (void *)a); + #pragma omp parallel num_threads(2) + { + int i = omp_get_thread_num(); + p[i] = omp_calloc(1024, 0, a); + #pragma omp barrier + printf("th %d, ptr %p\n", i, p[i]); + omp_free(p[i], a); + } + // Both pointers should be NULL + if (p[0] == NULL && p[1] == NULL) { + printf("passed\n"); + return 0; + } else { + printf("failed: pointers %p %p\n", p[0], p[1]); + return 1; + } +} diff --git a/runtime/test/api/omp_display_env0.c b/runtime/test/api/omp_display_env0.c new file mode 100644 index 000000000..85d085ecd --- /dev/null +++ b/runtime/test/api/omp_display_env0.c @@ -0,0 +1,14 @@ +// RUN: %libomp-compile-and-run 2>&1 | FileCheck %s +// RUN: %libomp-cxx-compile-c && %libomp-run 2>&1 | FileCheck %s +#include +#include +int main() +{ + omp_display_env(0); + printf("passed\n"); + return 0; +} + +// CHECK: OPENMP DISPLAY ENVIRONMENT BEGIN +// CHECK: _OPENMP +// CHECK: OPENMP DISPLAY ENVIRONMENT END diff --git a/runtime/test/api/omp_get_num_devices.c b/runtime/test/api/omp_get_num_devices.c index d534fa3d7..f4dd1e242 100644 --- a/runtime/test/api/omp_get_num_devices.c +++ b/runtime/test/api/omp_get_num_devices.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// UNSUPPORTED: icc-16, icc-17, icc-18 #include #include "omp_testsuite.h" diff --git a/runtime/test/api/omp_realloc_def_fb.c b/runtime/test/api/omp_realloc_def_fb.c new file mode 100644 index 000000000..667172cb6 --- /dev/null +++ b/runtime/test/api/omp_realloc_def_fb.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run + +#include +#include + +int main() { + omp_alloctrait_t at[2]; + omp_allocator_handle_t a; + omp_allocator_handle_t f_a; + void *ptr[2]; + void *nptr[2]; + at[0].key = omp_atk_pool_size; + at[0].value = 2 * 1024 * 1024; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; + + a = omp_init_allocator(omp_large_cap_mem_space, 2, at); + f_a = omp_init_allocator(omp_default_mem_space, 2, at); + printf("allocator large created: %p\n", (void *)a); + printf("allocator default created: %p\n", (void *)f_a); + + #pragma omp parallel num_threads(2) + { + int i = omp_get_thread_num(); + ptr[i] = omp_alloc(1024 * 1024, f_a); + #pragma omp barrier + nptr[i] = omp_realloc(ptr[i], 1024 * 1024, a, f_a); + #pragma omp barrier + printf("th %d, nptr %p\n", i, nptr[i]); + omp_free(nptr[i], a); + } + // Both pointers should be non-NULL + if (nptr[0] != NULL && nptr[1] != NULL) { + printf("passed\n"); + return 0; + } else { + printf("failed: pointers %p %p\n", nptr[0], nptr[1]); + return 1; + } +} diff --git a/runtime/test/api/omp_realloc_null_ptr.c b/runtime/test/api/omp_realloc_null_ptr.c new file mode 100644 index 000000000..1483e122c --- /dev/null +++ b/runtime/test/api/omp_realloc_null_ptr.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run + +#include +#include + +int main() +{ + omp_alloctrait_t at[2]; + omp_allocator_handle_t a; + omp_allocator_handle_t f_a; + void *ptr[2]; + void *nptr[2]; + at[0].key = omp_atk_pool_size; + at[0].value = 2*1024*1024; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; + + a = omp_init_allocator(omp_large_cap_mem_space, 2, at); + f_a = omp_init_allocator(omp_default_mem_space, 2, at); + printf("allocator large created: %p\n", (void *)a); + printf("allocator default created: %p\n", (void *)f_a); + + #pragma omp parallel num_threads(2) + { + 
int i = omp_get_thread_num(); + ptr[i] = omp_alloc(0, f_a); + #pragma omp barrier + nptr[i] = omp_realloc(ptr[i], 1024 * 1024, a, f_a); + #pragma omp barrier + printf("th %d, nptr %p\n", i, nptr[i]); + omp_free(nptr[i], a); + } + + // Both ptr pointers should be NULL + if (ptr[0] != NULL || ptr[1] != NULL) { + printf("failed: pointers %p %p\n", ptr[0], ptr[1]); + return 1; + } + // Both nptr pointers should be non-NULL + if (nptr[0] == NULL || nptr[1] == NULL) { + printf("failed: pointers %p %p\n", nptr[0], nptr[1]); + return 1; + } + printf("passed\n"); + return 0; +} diff --git a/runtime/test/api/omp_realloc_size_0.c b/runtime/test/api/omp_realloc_size_0.c new file mode 100644 index 000000000..bfd027532 --- /dev/null +++ b/runtime/test/api/omp_realloc_size_0.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile-and-run + +#include +#include + +int main() +{ + omp_alloctrait_t at[2]; + omp_allocator_handle_t a; + omp_allocator_handle_t f_a; + void *ptr[2]; + void *nptr[2]; + at[0].key = omp_atk_pool_size; + at[0].value = 2*1024*1024; + at[1].key = omp_atk_fallback; + at[1].value = omp_atv_default_mem_fb; + + a = omp_init_allocator(omp_large_cap_mem_space, 2, at); + f_a = omp_init_allocator(omp_default_mem_space, 2, at); + printf("allocator large created: %p\n", (void *)a); + printf("allocator default created: %p\n", (void *)f_a); + + #pragma omp parallel num_threads(2) + { + int i = omp_get_thread_num(); + ptr[i] = omp_alloc(1024 * 1024, f_a); + #pragma omp barrier + nptr[i] = omp_realloc(ptr[i], 0, a, f_a); + #pragma omp barrier + printf("th %d, nptr %p\n", i, nptr[i]); + omp_free(nptr[i], a); + } + + // Both ptr pointers should be non-NULL + if (ptr[0] == NULL || ptr[1] == NULL) { + printf("failed: pointers %p %p\n", ptr[0], ptr[1]); + return 1; + } + // Both nptr pointers should be NULL + if (nptr[0] != NULL || nptr[1] != NULL) { + printf("failed: pointers %p %p\n", nptr[0], nptr[1]); + return 1; + } + printf("passed\n"); + return 0; +} diff --git a/runtime/test/barrier/omp_barrier.c b/runtime/test/barrier/omp_barrier.c index a3fb06086..3da70db92 100644 --- a/runtime/test/barrier/omp_barrier.c +++ b/runtime/test/barrier/omp_barrier.c @@ -1,4 +1,7 @@ // RUN: %libomp-compile-and-run +// RUN: %libomp-compile && env KMP_BLOCKTIME=infinite %libomp-run +// RUN: %libomp-compile && env KMP_PLAIN_BARRIER_PATTERN='hierarchical,hierarchical' KMP_FORKJOIN_BARRIER_PATTERN='hierarchical,hierarchical' %libomp-run +// RUN: %libomp-compile && env KMP_BLOCKTIME=infinite KMP_PLAIN_BARRIER_PATTERN='hierarchical,hierarchical' KMP_FORKJOIN_BARRIER_PATTERN='hierarchical,hierarchical' %libomp-run #include #include "omp_testsuite.h" #include "omp_my_sleep.h" diff --git a/runtime/test/bolt/interop/init_then_openmp.c b/runtime/test/bolt/interop/init_then_openmp.c new file mode 100644 index 000000000..7db1bf3fe --- /dev/null +++ b/runtime/test/bolt/interop/init_then_openmp.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include + +int test_init_then_openmp(int num_init) { + int i; + int val = 0; + + for (i = 0; i < num_init; i++) { + ABT_EXIT_IF_FAIL(ABT_init(0, 0)); + } + + #pragma omp parallel num_threads(NUM_TASKS) + { + #pragma omp master + { val = 1; } + } + + for (i = 0; i < num_init; i++) { + ABT_EXIT_IF_FAIL(ABT_finalize()); + } + + return val; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + // Note that Argobots will be initialized once BOLT is instantiated. 
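+    // Each repetition wraps the OpenMP region in one more
+    // ABT_init/ABT_finalize pair than the previous one.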
+ if (!test_init_then_openmp(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/interop/openmp_then_init.c b/runtime/test/bolt/interop/openmp_then_init.c new file mode 100644 index 000000000..dc5635a7e --- /dev/null +++ b/runtime/test/bolt/interop/openmp_then_init.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include + +int test_openmp_then_init(int num_init) { + int i; + int val = 0; + + #pragma omp parallel num_threads(NUM_TASKS) + { + #pragma omp master + { + int initialized = (ABT_initialized() == ABT_SUCCESS); + for (i = 0; i < num_init; i++) { + ABT_EXIT_IF_FAIL(ABT_init(0, 0)); + } + val = initialized ? 1 : 0; + for (i = 0; i < num_init; i++) { + ABT_EXIT_IF_FAIL(ABT_finalize()); + } + } + } + return val; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + // Note that Argobots will be initialized once BOLT is instantiated. + if (!test_openmp_then_init(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/misc_bugs/untied_tasks.c b/runtime/test/bolt/misc_bugs/untied_tasks.c new file mode 100644 index 000000000..88e8edeba --- /dev/null +++ b/runtime/test/bolt/misc_bugs/untied_tasks.c @@ -0,0 +1,61 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" + +int test_omp_untied_tasks() +{ + // https://github.com/pmodels/bolt/issues/49 + int val = 0; + #pragma omp parallel + #pragma omp master + { + #pragma omp task untied + { val = 1; } + } + return val; +} + +int test_omp_tied_tasks() +{ + int val = 0; + #pragma omp parallel + #pragma omp master + { + #pragma omp task + { val = 1; } + } + return val; +} + +int test_omp_tied_and_untied_tasks() +{ + int val1 = 0; + int val2 = 0; + #pragma omp parallel + #pragma omp master + { + #pragma omp task + { val1 = 1; } + #pragma omp task untied + { val2 = 1; } + } + return val1 == 1 && val2 == 1; +} + +int main() +{ + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_untied_tasks()) { + num_failed++; + } + if (!test_omp_tied_tasks()) { + num_failed++; + } + if (!test_omp_tied_and_untied_tasks()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/bolt_scheduling_util.h b/runtime/test/bolt/scheduling/bolt_scheduling_util.h new file mode 100644 index 000000000..8e8ef15fd --- /dev/null +++ b/runtime/test/bolt/scheduling/bolt_scheduling_util.h @@ -0,0 +1,77 @@ + +#ifndef BOLT_SCHEDULING_UTIL_H +#define BOLT_SCHEDULING_UTIL_H + +#include "omp_testsuite.h" +#include +#include +#include +#include + +void check_num_ess(int desired) { + int num_xstreams; + ABT_EXIT_IF_FAIL(ABT_xstream_get_num(&num_xstreams)); + if (num_xstreams != desired) { + printf("check_num_ess: num_xstreams (%d) != desired (%d)\n", num_xstreams, + desired); + exit(1); + } +} + +typedef struct { + int counter, flag; +} timeout_barrier_t; + +void timeout_barrier_init(timeout_barrier_t *barrier) { + barrier->counter = 0; + barrier->flag = 0; +} + +void timeout_barrier_wait(timeout_barrier_t *barrier, int num_waiters) { + const int timeout_ms = 4000; + const int wait_ms = 200; + + // return 1 if failed. 
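+  // The last arriver holds the barrier open for wait_ms to detect unexpected
+  // extra arrivals, then releases the other waiters; each waiter aborts the
+  // test if the barrier does not complete before the timeout expires.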
+ int *p_counter = &barrier->counter; + int *p_flag = &barrier->flag; + + if (__atomic_add_fetch(p_counter, 1, __ATOMIC_ACQ_REL) == num_waiters) { + double start_time = omp_get_wtime(); + while (omp_get_wtime() < start_time + wait_ms / 1000.0) { + if (__atomic_load_n(p_counter, __ATOMIC_ACQUIRE) != num_waiters) { + printf("timeout_barrier_wait: # of arrivals > num_waiters (%d)\n", num_waiters); + exit(1); + } + sched_yield(); + } + // Going back to the normal barrier implementation. + __atomic_store_n(p_flag, 1, __ATOMIC_RELEASE); + // wait until current_counter gets 1 + do { + // This does not require timeout. + sched_yield(); + } while (__atomic_load_n(p_counter, __ATOMIC_ACQUIRE) != 1); + // update a flag again. + __atomic_store_n(p_counter, 0, __ATOMIC_RELEASE); + __atomic_store_n(p_flag, 0, __ATOMIC_RELEASE); + } else { + double start_time = omp_get_wtime(); + do { + if (omp_get_wtime() > start_time + (timeout_ms + wait_ms) / 1000.0) { + printf("timeout_barrier_wait: timeout expires (%d)\n", + (int)__atomic_load_n(p_counter, __ATOMIC_ACQUIRE)); + exit(1); + } + sched_yield(); + } while (__atomic_load_n(p_flag, __ATOMIC_ACQUIRE) == 0); + // now p_flag is 1. Let's decrease the counter. + __atomic_sub_fetch(p_counter, 1, __ATOMIC_ACQ_REL); + // wait until p_flag gets 0. + do { + // This does not require timeout. + sched_yield(); + } while (__atomic_load_n(p_flag, __ATOMIC_ACQUIRE) == 1); + } +} + +#endif // BOLT_SCHEDULING_UTIL_H diff --git a/runtime/test/bolt/scheduling/for_nowait_scheduling.c b/runtime/test/bolt/scheduling/for_nowait_scheduling.c new file mode 100644 index 000000000..7344490bf --- /dev/null +++ b/runtime/test/bolt/scheduling/for_nowait_scheduling.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_for_nowait_scheduling() { + int i, vals[4]; + memset(vals, 0, sizeof(int) * 4); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + check_num_ess(4); + int tid = omp_get_thread_num(); + #pragma omp for nowait + for (i = 0; i < 4; i++) { + if (tid < 2) { + timeout_barrier_wait(&barrier, 4); + } + } + if (tid >= 2) { + // The following barrier must be synchronized with the "for" above. 
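+      // Threads 2 and 3 can get here only because the loop has "nowait"; if
+      // they were forced to wait at the end of the loop, the barrier would
+      // time out while threads 0 and 1 are still blocked in their iterations.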
+ timeout_barrier_wait(&barrier, 4); + } + vals[omp_get_thread_num()] = 1; + } + + for (i = 0; i < 4; i++) { + if (vals[i] != 1) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 1; i < REPETITIONS; i++) { + if (!test_for_nowait_scheduling(i)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/task_tied_scheduling.c b/runtime/test/bolt/scheduling/task_tied_scheduling.c new file mode 100644 index 000000000..9aeba82e7 --- /dev/null +++ b/runtime/test/bolt/scheduling/task_tied_scheduling.c @@ -0,0 +1,49 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_task_tied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + for (i = 0; i < 6; i++) { + #pragma omp task firstprivate(i) + { + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + } + if (omp_get_thread_num() < 2) { + timeout_barrier_wait(&barrier, 4); + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 1) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_tied_scheduling(i)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/task_tied_thread_scheduling.c b/runtime/test/bolt/scheduling/task_tied_thread_scheduling.c new file mode 100644 index 000000000..057c47ce6 --- /dev/null +++ b/runtime/test/bolt/scheduling/task_tied_thread_scheduling.c @@ -0,0 +1,76 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_task_tied_thread_scheduling(int num_threads) { + int vals[num_threads * num_threads]; + memset(vals, 0, sizeof(int) * num_threads * num_threads); + omp_set_max_active_levels(2); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(num_threads) + #pragma omp master + { + check_num_ess(4); + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) + { + #pragma omp parallel num_threads(num_threads) + { + if (omp_get_thread_num() == 1) { + // We should not block a master thread since it might need to create + // other outer tasks. + timeout_barrier_wait(&barrier, 4); + } + vals[i * num_threads + omp_get_thread_num()] += 1; + } + } + } + } + + #pragma omp parallel num_threads(num_threads) + #pragma omp master + { + check_num_ess(4); + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) + { + int j; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + if (omp_get_thread_num() == 1) { + // We should not block a master thread since it might need to create + // other outer tasks. 
+ timeout_barrier_wait(&barrier, 4); + } + vals[i * num_threads + j] += 2; + } + } + } + } + + int index; + for (index = 0; index < num_threads * num_threads; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 1; i < 3; i++) { + if (!test_task_tied_thread_scheduling(i * 4)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/task_unitied_scheduling.c b/runtime/test/bolt/scheduling/task_unitied_scheduling.c new file mode 100644 index 000000000..290997e99 --- /dev/null +++ b/runtime/test/bolt/scheduling/task_unitied_scheduling.c @@ -0,0 +1,69 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_task_untied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + for (i = 0; i < 6; i++) { + #pragma omp task firstprivate(i) untied + { + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + } + if (omp_get_thread_num() < 2) { + timeout_barrier_wait(&barrier, 4); + } + } + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + for (i = 0; i < 6; i++) { + #pragma omp task firstprivate(i) untied + { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + vals[i] += 2; + } + } + } + if (omp_get_thread_num() < 2) { + timeout_barrier_wait(&barrier, 4); + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_untied_scheduling()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/task_untied_thread_scheduling.c b/runtime/test/bolt/scheduling/task_untied_thread_scheduling.c new file mode 100644 index 000000000..194eebf92 --- /dev/null +++ b/runtime/test/bolt/scheduling/task_untied_thread_scheduling.c @@ -0,0 +1,74 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_task_untied_thread_scheduling(int num_threads) { + int vals[num_threads * num_threads]; + memset(vals, 0, sizeof(int) * num_threads * num_threads); + omp_set_max_active_levels(2); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(num_threads) + #pragma omp master + { + check_num_ess(4); + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) untied + { + #pragma omp parallel num_threads(num_threads) + { + if (omp_get_thread_num() == 0) { + // We can block a master thread since the associated task is untied + timeout_barrier_wait(&barrier, 4); + } + vals[i * num_threads + omp_get_thread_num()] += 1; + } + } + } + } + + #pragma omp parallel num_threads(num_threads) + #pragma omp master + { + check_num_ess(4); + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) untied + { + int j; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + if (omp_get_thread_num() == 0) { + // We can block a 
master thread since the associated task is untied + timeout_barrier_wait(&barrier, 4); + } + vals[i * num_threads + j] += 2; + } + } + } + } + + int index; + for (index = 0; index < num_threads * num_threads; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 1; i < 3; i++) { + if (!test_task_untied_thread_scheduling(i * 4)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskgroup_tied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskgroup_tied_scheduling.c new file mode 100644 index 000000000..ea2989a20 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskgroup_tied_scheduling.c @@ -0,0 +1,123 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskgroup_tied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + #pragma omp taskgroup + { + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
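+      // The six anti-diagonal task waits plus these two thread waits make
+      // eight arrivals, i.e. two rounds of the four-way barrier.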
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskgroup_tied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_scheduling.c new file mode 100644 index 000000000..e1ba4f498 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_scheduling.c @@ -0,0 +1,123 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskgroup_untied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + #pragma omp taskgroup + { + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskgroup_untied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_yield_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_yield_scheduling.c new file mode 100644 index 000000000..b2dc1553b --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskgroup_untied_yield_scheduling.c @@ -0,0 +1,127 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskgroup_untied_yield_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + #pragma omp taskgroup + { + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskgroup_untied_yield_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskwait_tied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskwait_tied_scheduling.c new file mode 100644 index 000000000..41ce313f5 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskwait_tied_scheduling.c @@ -0,0 +1,121 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskwait_tied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskwait_tied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskwait_untied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskwait_untied_scheduling.c new file mode 100644 index 000000000..3e24bd872 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskwait_untied_scheduling.c @@ -0,0 +1,121 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskwait_untied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskwait_untied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_taskwait_untied_yield_scheduling.c b/runtime/test/bolt/scheduling/taskdep_taskwait_untied_yield_scheduling.c new file mode 100644 index 000000000..c2eb42f2a --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_taskwait_untied_yield_scheduling.c @@ -0,0 +1,125 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_taskwait_untied_yield_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + if (omp_get_thread_num() >= 2) { + // The master thread needs to wait for tasks, so non-master threads should + // run it. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_taskwait_untied_yield_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_tied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_tied_scheduling.c new file mode 100644 index 000000000..1b98bc9cc --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_tied_scheduling.c @@ -0,0 +1,122 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_tied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not need to wait for tasks, so the master thread + // can execute the following after task creation. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_tied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_untied_scheduling.c b/runtime/test/bolt/scheduling/taskdep_untied_scheduling.c new file mode 100644 index 000000000..91a220784 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_untied_scheduling.c @@ -0,0 +1,122 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_untied_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not need to wait for tasks, so the master thread + // can execute the following after task creation. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_untied_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskdep_untied_yield_scheduling.c b/runtime/test/bolt/scheduling/taskdep_untied_yield_scheduling.c new file mode 100644 index 000000000..79c187373 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskdep_untied_yield_scheduling.c @@ -0,0 +1,126 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int test_taskdep_untied_yield_scheduilng() { + int n = 6; + int seq_val, task_val; + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(4) + { + #pragma omp master + { + // 6 ( = n) barrier_waits in diagonal tasks and 2 barrier_waits in threads + check_num_ess(4); + int i, j; + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + if (i + j == n - 1) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + } + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not need to wait for tasks, so the master thread + // can execute the following after task creation. 
+ timeout_barrier_wait(&barrier, 4); + } + } + + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("Failed: route(%d) = %d (ANS = %d)\n", n, task_val, seq_val); + return 0; + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_untied_yield_scheduilng()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskloop_nogroup_tied_scheduling.c b/runtime/test/bolt/scheduling/taskloop_nogroup_tied_scheduling.c new file mode 100644 index 000000000..ac66be4eb --- /dev/null +++ b/runtime/test/bolt/scheduling/taskloop_nogroup_tied_scheduling.c @@ -0,0 +1,48 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_taskloop_nogroup_tied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) nogroup + for (i = 0; i < 6; i++) { + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not wait for the completion of the taskloop. + timeout_barrier_wait(&barrier, 4); + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 1) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_nogroup_tied_scheduling()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskloop_nogroup_untied_scheduling.c b/runtime/test/bolt/scheduling/taskloop_nogroup_untied_scheduling.c new file mode 100644 index 000000000..76e8eb685 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskloop_nogroup_untied_scheduling.c @@ -0,0 +1,70 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt && !clang + +// Clang 10.0 seems ignoring the taskloop's "untied" attribute. +// We mark taskloop + untied with Clang as unsupported so far. +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_taskloop_nogroup_untied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) nogroup untied + for (i = 0; i < 6; i++) { + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not wait for the completion of the taskloop. + timeout_barrier_wait(&barrier, 4); + } + } + + #pragma omp parallel num_threads(4) + { + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) nogroup untied + for (i = 0; i < 6; i++) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + if (omp_get_thread_num() < 2) { + // The master thread does not wait for the completion of the taskloop.
+ timeout_barrier_wait(&barrier, 4); + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 1) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_nogroup_untied_scheduling()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskloop_tied_scheduling.c b/runtime/test/bolt/scheduling/taskloop_tied_scheduling.c new file mode 100644 index 000000000..db9eeb95a --- /dev/null +++ b/runtime/test/bolt/scheduling/taskloop_tied_scheduling.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_taskloop_tied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + if (omp_get_thread_num() >= 2) { + timeout_barrier_wait(&barrier, 4); + } + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) + for (i = 0; i < 6; i++) { + timeout_barrier_wait(&barrier, 4); + vals[i] = 1; + } + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 1) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_tied_scheduling()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/taskloop_untied_scheduling.c b/runtime/test/bolt/scheduling/taskloop_untied_scheduling.c new file mode 100644 index 000000000..ee118ebc6 --- /dev/null +++ b/runtime/test/bolt/scheduling/taskloop_untied_scheduling.c @@ -0,0 +1,68 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt && !clang + +// Clang 10.0 seems ignoring the taskloop's "untied" attribute. +// We mark taskloop + untied with Clang as unsupported so far. 
+#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_taskloop_untied_scheduling() { + int i, vals[6]; + memset(vals, 0, sizeof(int) * 6); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(4) + { + if (omp_get_thread_num() >= 2) { + timeout_barrier_wait(&barrier, 4); + } + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) untied + for (i = 0; i < 6; i++) { + timeout_barrier_wait(&barrier, 4); + vals[i] += 1; + } + } + } + + #pragma omp parallel num_threads(4) + { + if (omp_get_thread_num() >= 2) { + timeout_barrier_wait(&barrier, 4); + } + // 6 barrier_waits in tasks and 2 barrier_waits in threads + #pragma omp master + { + check_num_ess(4); + #pragma omp taskloop grainsize(1) untied + for (i = 0; i < 6; i++) { + #pragma omp taskyield + timeout_barrier_wait(&barrier, 4); + vals[i] += 2; + } + } + } + + for (i = 0; i < 6; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_untied_scheduling()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/thread_scheduling.c b/runtime/test/bolt/scheduling/thread_scheduling.c new file mode 100644 index 000000000..d4ef16fff --- /dev/null +++ b/runtime/test/bolt/scheduling/thread_scheduling.c @@ -0,0 +1,46 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_thread_scheduling(int num_threads) { + int i, vals[num_threads]; + memset(vals, 0, sizeof(int) * num_threads); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(num_threads) + { + check_num_ess(4); + // The barrier must be run by all ESs. + timeout_barrier_wait(&barrier, 4); + vals[omp_get_thread_num()] += 1; + } + + #pragma omp parallel for num_threads(num_threads) + for (i = 0; i < num_threads; i++) { + check_num_ess(4); + // The barrier must be run by all ESs. 
+ timeout_barrier_wait(&barrier, 4); + vals[i] += 2; + } + + for (i = 0; i < num_threads; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 1; i < 4; i++) { + if (!test_thread_scheduling(i * 4)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/scheduling/thread_thread_scheduling.c b/runtime/test/bolt/scheduling/thread_thread_scheduling.c new file mode 100644 index 000000000..923a9d025 --- /dev/null +++ b/runtime/test/bolt/scheduling/thread_thread_scheduling.c @@ -0,0 +1,58 @@ +// RUN: %libomp-compile && env KMP_ABT_NUM_ESS=4 %libomp-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include "bolt_scheduling_util.h" + +int test_thread_thread_scheduling(int num_threads) { + int i, vals[num_threads * num_threads]; + memset(vals, 0, sizeof(int) * num_threads * num_threads); + omp_set_max_active_levels(2); + + timeout_barrier_t barrier; + timeout_barrier_init(&barrier); + + #pragma omp parallel num_threads(num_threads) + { + check_num_ess(4); + int parent_tid = omp_get_thread_num(); + #pragma omp parallel num_threads(num_threads) + { + if (parent_tid == omp_get_thread_num()) { + timeout_barrier_wait(&barrier, 4); + } + vals[parent_tid * num_threads + omp_get_thread_num()] += 1; + } + } + + #pragma omp parallel for num_threads(num_threads) + for (i = 0; i < num_threads; i++) { + check_num_ess(4); + int j, parent_i = i; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + if (parent_i == j) { + timeout_barrier_wait(&barrier, 4); + } + vals[parent_i * num_threads + j] += 2; + } + } + + for (i = 0; i < num_threads * num_threads; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 1; i < 3; i++) { + if (!test_thread_thread_scheduling(i * 4)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/task_tied_thread_threadid.c b/runtime/test/bolt/threadid/task_tied_thread_threadid.c new file mode 100644 index 000000000..46ae309a6 --- /dev/null +++ b/runtime/test/bolt/threadid/task_tied_thread_threadid.c @@ -0,0 +1,103 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include + +int test_task_tied_thread_threadid(int num_threads) { + int vals[num_threads]; + memset(vals, 0, sizeof(int) * num_threads); + omp_set_max_active_levels(2); + + #pragma omp parallel num_threads(num_threads / 2 + 1) + #pragma omp master + { + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) + { + int omp_thread_id = omp_get_thread_num(); + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + int local_vals[num_threads]; + memset(local_vals, 0, sizeof(int) * num_threads); + + int j; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + int l2_omp_thread_id = omp_get_thread_num(); + ABT_thread l2_abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread)); + + // Context switching in OpenMP. 
+ #pragma omp taskyield + + int l2_omp_thread_id2 = omp_get_thread_num(); + if (l2_omp_thread_id == l2_omp_thread_id2) { + local_vals[j] += 1; + } + ABT_thread l2_abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread2)); + ABT_bool l2_abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(l2_abt_thread, l2_abt_thread2, + &l2_abt_thread_equal)); + if (l2_abt_thread_equal == ABT_TRUE) { + local_vals[j] += 2; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int l2_omp_thread_id3 = omp_get_thread_num(); + if (l2_omp_thread_id2 == l2_omp_thread_id3) { + local_vals[j] += 4; + } + } + + // Check child threads. + int child_fail = 0; + for (j = 0; j < num_threads; j++) { + if (local_vals[i] != 7) { + child_fail = 1; + } + } + if (!child_fail) { + vals[i] += 1; + } + + int omp_thread_id2 = omp_get_thread_num(); + if (omp_thread_id == omp_thread_id2) { + vals[i] += 2; + } + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 4; + } + } + } + } + + int index; + for (index = 0; index < num_threads; index++) { + if (vals[index] != 7) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_tied_thread_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/task_tied_threadid.c b/runtime/test/bolt/threadid/task_tied_threadid.c new file mode 100644 index 000000000..4b3cd685d --- /dev/null +++ b/runtime/test/bolt/threadid/task_tied_threadid.c @@ -0,0 +1,67 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include + +int test_task_tied_threadid(int num_threads) { + int i, vals[NUM_TASKS]; + memset(vals, 0, sizeof(vals)); + + #pragma omp parallel num_threads(num_threads) + { + #pragma omp master + { + for (i = 0; i < NUM_TASKS; i++) { + #pragma omp task firstprivate(i) + { + int omp_thread_id = omp_get_thread_num(); + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + if (omp_thread_id == omp_thread_id2) { + vals[i] += 1; + } + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 2; + } + + // Context switching in Argobots. 
+ ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + vals[i] += 4; + } + } + } + } + } + + for (i = 0; i < NUM_TASKS; i++) { + if (vals[i] != 7) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_tied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/task_unitied_thread_threadid.c b/runtime/test/bolt/threadid/task_unitied_thread_threadid.c new file mode 100644 index 000000000..06f4b1fc2 --- /dev/null +++ b/runtime/test/bolt/threadid/task_unitied_thread_threadid.c @@ -0,0 +1,101 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt + +// Compilation error after clang11+ +// UNSUPPORTED: clang +#include "omp_testsuite.h" +#include +#include + +int test_task_untied_thread_threadid(int num_threads) { + int vals[num_threads]; + memset(vals, 0, sizeof(int) * num_threads); + omp_set_max_active_levels(2); + + #pragma omp parallel num_threads(num_threads / 2 + 1) + #pragma omp master + { + int i; + for (i = 0; i < num_threads; i++) { + #pragma omp task firstprivate(i) untied + { + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + int local_vals[num_threads]; + memset(local_vals, 0, sizeof(int) * num_threads); + + int j; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + int l2_omp_thread_id = omp_get_thread_num(); + ABT_thread l2_abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int l2_omp_thread_id2 = omp_get_thread_num(); + if (l2_omp_thread_id == l2_omp_thread_id2) { + local_vals[j] += 1; + } + ABT_thread l2_abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread2)); + ABT_bool l2_abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(l2_abt_thread, l2_abt_thread2, + &l2_abt_thread_equal)); + if (l2_abt_thread_equal == ABT_TRUE) { + local_vals[j] += 2; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int l2_omp_thread_id3 = omp_get_thread_num(); + if (l2_omp_thread_id2 == l2_omp_thread_id3) { + local_vals[j] += 4; + } + } + + // Check child threads. + int child_fail = 0; + for (j = 0; j < num_threads; j++) { + if (local_vals[i] != 7) { + child_fail = 1; + } + } + if (!child_fail) { + vals[i] += 1; + } + + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 2; + } + } + } + } + + int index; + for (index = 0; index < num_threads; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_untied_thread_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/task_untied_threadid.c b/runtime/test/bolt/threadid/task_untied_threadid.c new file mode 100644 index 000000000..53336537f --- /dev/null +++ b/runtime/test/bolt/threadid/task_untied_threadid.c @@ -0,0 +1,67 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt && !clang + +// Clang 10.0 discards local variables saved before taskyield. 
We mark untied +// task tests that use local variables with Clang as unsupported so far. +#include "omp_testsuite.h" +#include +#include + +int test_task_untied_threadid(int num_threads) { + int i, vals[NUM_TASKS]; + memset(vals, 0, sizeof(vals)); + + #pragma omp parallel num_threads(num_threads) + { + #pragma omp master + { + for (i = 0; i < NUM_TASKS; i++) { + #pragma omp task firstprivate(i) untied + { + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 1; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + // Argobots context switch does not change the thread-task mapping. + vals[i] += 2; + } + } + } + } + } + + for (i = 0; i < NUM_TASKS; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_untied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/task_untied_threadid2.c b/runtime/test/bolt/threadid/task_untied_threadid2.c new file mode 100644 index 000000000..eb20e210e --- /dev/null +++ b/runtime/test/bolt/threadid/task_untied_threadid2.c @@ -0,0 +1,68 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt + +// Compilation error after clang11+ +// UNSUPPORTED: clang +#include "omp_testsuite.h" +#include +#include + +int test_task_untied_threadid2(int num_threads) { + int i, vals[NUM_TASKS]; + ABT_thread abt_threads[NUM_TASKS]; + memset(vals, 0, sizeof(vals)); + + #pragma omp parallel num_threads(num_threads) + { + #pragma omp master + { + for (i = 0; i < NUM_TASKS; i++) { + #pragma omp task firstprivate(i) untied + { + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_threads[i])); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + ABT_thread abt_thread = abt_threads[i]; + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 1; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + // Argobots context switch does not change the thread-task mapping. 
+ vals[i] += 2; + } + } + } + } + } + + for (i = 0; i < NUM_TASKS; i++) { + if (vals[i] != 3) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_task_untied_threadid2(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/taskdep_tied_threadid.c b/runtime/test/bolt/threadid/taskdep_tied_threadid.c new file mode 100644 index 000000000..a7f67ba1d --- /dev/null +++ b/runtime/test/bolt/threadid/taskdep_tied_threadid.c @@ -0,0 +1,142 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +#define TASK_TIED_CHECK(_val_index) \ + do { \ + int val_index = (_val_index); \ + int omp_thread_id = omp_get_thread_num(); \ + ABT_thread abt_thread; \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); \ + \ + _Pragma("omp taskyield") \ + \ + int omp_thread_id2 = omp_get_thread_num(); \ + if (omp_thread_id == omp_thread_id2) { \ + vals[val_index] += 1; \ + } \ + ABT_thread abt_thread2; \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); \ + ABT_bool abt_thread_equal; \ + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, \ + &abt_thread_equal)); \ + if (abt_thread_equal == ABT_TRUE) { \ + vals[val_index] += 2; \ + } \ + \ + ABT_EXIT_IF_FAIL(ABT_thread_yield()); \ + \ + int omp_thread_id3 = omp_get_thread_num(); \ + if (omp_thread_id2 == omp_thread_id3) { \ + vals[val_index] += 4; \ + } \ + } while (0) + +int test_taskdep_tied_threadid(int num_threads) { + int n = 10; + int seq_val, task_val; + + int vals[n * n]; + memset(vals, 0, sizeof(int) * n * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(num_threads) + #pragma omp master + { + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. 
+ for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) + { + TASK_TIED_CHECK(i * n + j); + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + TASK_TIED_CHECK(i * n + j); + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) + { + TASK_TIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) + { + TASK_TIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("[%d] Failed: route(%d) = %d (ANS = %d)\n", num_threads, n, task_val, + seq_val); + return 0; + } + int index; + for (index = 0; index < n * n; index++) { + if (vals[index] != 7) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_tied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/taskdep_untied_threadid.c b/runtime/test/bolt/threadid/taskdep_untied_threadid.c new file mode 100644 index 000000000..16b601a4d --- /dev/null +++ b/runtime/test/bolt/threadid/taskdep_untied_threadid.c @@ -0,0 +1,142 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt && !clang + +// Clang 10.0 discards local variables saved before taskyield. We mark untied +// task tests that use local variables across taskyield with Clang as +// unsupported so far. 
+#include "omp_testsuite.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +#define TASK_UNTIED_CHECK(_val_index) \ + do { \ + int val_index = (_val_index); \ + ABT_thread abt_thread; \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); \ + \ + _Pragma("omp taskyield") \ + \ + int omp_thread_id2 = omp_get_thread_num(); \ + ABT_thread abt_thread2; \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); \ + ABT_bool abt_thread_equal; \ + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, \ + &abt_thread_equal)); \ + if (abt_thread_equal == ABT_TRUE) { \ + vals[val_index] += 1; \ + } \ + \ + ABT_EXIT_IF_FAIL(ABT_thread_yield()); \ + \ + int omp_thread_id3 = omp_get_thread_num(); \ + if (omp_thread_id2 == omp_thread_id3) { \ + vals[val_index] += 2; \ + } \ + } while (0) + +int test_taskdep_untied_threadid(int num_threads) { + int n = 10; + int seq_val, task_val; + + int vals[n * n]; + memset(vals, 0, sizeof(int) * n * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(num_threads) + #pragma omp master + { + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. 
+ for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("[%d] Failed: route(%d) = %d (ANS = %d)\n", num_threads, n, task_val, + seq_val); + return 0; + } + int index; + for (index = 0; index < n * n; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_untied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/taskdep_untied_threadid2.c b/runtime/test/bolt/threadid/taskdep_untied_threadid2.c new file mode 100644 index 000000000..77a1e37bc --- /dev/null +++ b/runtime/test/bolt/threadid/taskdep_untied_threadid2.c @@ -0,0 +1,141 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt + +// Compilation error after clang11+ +// UNSUPPORTED: clang +#include "omp_testsuite.h" +#include +#include +#include + +int calc_seq(int n) { + int i, j, *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + int ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +#define TASK_UNTIED_CHECK(val_index) \ + do { \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_threads[(val_index)])); \ + \ + _Pragma("omp taskyield") \ + \ + ABT_thread abt_thread = abt_threads[(val_index)]; \ + int omp_thread_id2 = omp_get_thread_num(); \ + ABT_thread abt_thread2; \ + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); \ + ABT_bool abt_thread_equal; \ + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, \ + &abt_thread_equal)); \ + if (abt_thread_equal == ABT_TRUE) { \ + vals[(val_index)] += 1; \ + } \ + \ + ABT_EXIT_IF_FAIL(ABT_thread_yield()); \ + \ + int omp_thread_id3 = omp_get_thread_num(); \ + if (omp_thread_id2 == omp_thread_id3) { \ + vals[(val_index)] += 2; \ + } \ + } while (0) + +int test_taskdep_untied_threadid2(int num_threads) { + int n = 10; + int seq_val, task_val; + + int vals[n * n]; + ABT_thread abt_threads[n * n]; + memset(vals, 0, sizeof(int) * n * n); + + #pragma omp parallel shared(task_val) firstprivate(n) num_threads(num_threads) + #pragma omp master + { + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. 
+ A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. + for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j]) \ + firstprivate(A, i, j) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1]) \ + depend(out:A[i][j]) untied + { + TASK_UNTIED_CHECK(i * n + j); + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("[%d] Failed: route(%d) = %d (ANS = %d)\n", num_threads, n, task_val, + seq_val); + return 0; + } + int index; + for (index = 0; index < n * n; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskdep_untied_threadid2(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/taskloop_tied_threadid.c b/runtime/test/bolt/threadid/taskloop_tied_threadid.c new file mode 100644 index 000000000..165d2ae98 --- /dev/null +++ b/runtime/test/bolt/threadid/taskloop_tied_threadid.c @@ -0,0 +1,67 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include + +int test_taskloop_tied_threadid(int num_threads) { + int vals[NUM_TASKS]; + memset(vals, 0, sizeof(vals)); + + #pragma omp parallel num_threads(num_threads) + { + #pragma omp master + { + int i; + #pragma omp taskloop grainsize(1) + for (i = 0; i < NUM_TASKS; i++) { + { + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 1; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + // Argobots context switch does not change the thread-task mapping. + vals[i] += 2; + } + } + } + } + } + + int index; + for (index = 0; index < NUM_TASKS; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_tied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/taskloop_untied_threadid.c b/runtime/test/bolt/threadid/taskloop_untied_threadid.c new file mode 100644 index 000000000..4793b98ce --- /dev/null +++ b/runtime/test/bolt/threadid/taskloop_untied_threadid.c @@ -0,0 +1,70 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt && !clang + +// Clang 10.0 seems ignoring the taskloop's "untied" attribute. +// We mark taskloop + untied with Clang as unsupported so far. 
+#include "omp_testsuite.h" +#include +#include + +int test_taskloop_untied_threadid(int num_threads) { + int vals[NUM_TASKS]; + memset(vals, 0, sizeof(vals)); + + #pragma omp parallel num_threads(num_threads) + { + #pragma omp master + { + int i; + #pragma omp taskloop grainsize(1) untied + for (i = 0; i < NUM_TASKS; i++) { + { + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 1; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + // Argobots context switch does not change the thread-task mapping. + vals[i] += 2; + } + } + } + } + } + + int index; + for (index = 0; index < NUM_TASKS; index++) { + if (vals[index] != 3) { + printf("vals[%d] == %d\n", index, vals[index]); + return 0; + } + } + return 1; +} + +int main() { + int i; + int num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_taskloop_untied_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/thread_thread_threadid.c b/runtime/test/bolt/threadid/thread_thread_threadid.c new file mode 100644 index 000000000..b82e740c7 --- /dev/null +++ b/runtime/test/bolt/threadid/thread_thread_threadid.c @@ -0,0 +1,95 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include + +int test_thread_thread_threadid(int num_threads) { + int i, vals[num_threads]; + memset(vals, 0, sizeof(int) * num_threads); + omp_set_max_active_levels(2); + + #pragma omp parallel for num_threads(num_threads) + for (i = 0; i < num_threads; i++) { + int omp_thread_id = omp_get_thread_num(); + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + int local_vals[num_threads]; + memset(local_vals, 0, sizeof(int) * num_threads); + + int j; + #pragma omp parallel for num_threads(num_threads) + for (j = 0; j < num_threads; j++) { + int l2_omp_thread_id = omp_get_thread_num(); + ABT_thread l2_abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int l2_omp_thread_id2 = omp_get_thread_num(); + if (l2_omp_thread_id == l2_omp_thread_id2) { + local_vals[j] += 1; + } + ABT_thread l2_abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&l2_abt_thread2)); + ABT_bool l2_abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(l2_abt_thread, l2_abt_thread2, + &l2_abt_thread_equal)); + if (l2_abt_thread_equal == ABT_TRUE) { + local_vals[j] += 2; + } + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int l2_omp_thread_id3 = omp_get_thread_num(); + if (l2_omp_thread_id2 == l2_omp_thread_id3) { + local_vals[j] += 4; + } + } + + // Check child threads. 
+ int child_fail = 0; + for (j = 0; j < num_threads; j++) { + if (local_vals[i] != 7) { + child_fail = 1; + } + } + if (!child_fail) { + vals[i] += 1; + } + + int omp_thread_id2 = omp_get_thread_num(); + if (omp_thread_id == omp_thread_id2) { + vals[i] += 2; + } + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 4; + } + } + + for (i = 0; i < num_threads; i++) { + if (vals[i] != 7) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_thread_thread_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/bolt/threadid/thread_threadid.c b/runtime/test/bolt/threadid/thread_threadid.c new file mode 100644 index 000000000..4af34df3c --- /dev/null +++ b/runtime/test/bolt/threadid/thread_threadid.c @@ -0,0 +1,61 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: abt +#include "omp_testsuite.h" +#include +#include + +int test_thread_threadid(int num_threads) { + int i, vals[num_threads]; + memset(vals, 0, sizeof(int) * num_threads); + + #pragma omp parallel for num_threads(num_threads) + for (i = 0; i < num_threads; i++) { + int omp_thread_id = omp_get_thread_num(); + ABT_thread abt_thread; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread)); + + // Context switching in OpenMP. + #pragma omp taskyield + + int omp_thread_id2 = omp_get_thread_num(); + if (omp_thread_id == omp_thread_id2) { + vals[i] += 1; + } + ABT_thread abt_thread2; + ABT_EXIT_IF_FAIL(ABT_thread_self(&abt_thread2)); + ABT_bool abt_thread_equal; + ABT_EXIT_IF_FAIL(ABT_thread_equal(abt_thread, abt_thread2, + &abt_thread_equal)); + if (abt_thread_equal == ABT_TRUE) { + vals[i] += 2; + } + + + // Context switching in Argobots. + ABT_EXIT_IF_FAIL(ABT_thread_yield()); + + int omp_thread_id3 = omp_get_thread_num(); + if (omp_thread_id2 == omp_thread_id3) { + // Argobots context switch does not change the underlying thread. 
+ vals[i] += 4; + } + } + + for (i = 0; i < num_threads; i++) { + if (vals[i] != 7) { + printf("vals[%d] == %d\n", i, vals[i]); + return 0; + } + } + return 1; +} + +int main() { + int i, num_failed = 0; + for (i = 0; i < REPETITIONS; i++) { + if (!test_thread_threadid(i + 1)) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/env/kmp_set_dispatch_buf.c b/runtime/test/env/kmp_set_dispatch_buf.c index 49eb7b54f..264510d55 100644 --- a/runtime/test/env/kmp_set_dispatch_buf.c +++ b/runtime/test/env/kmp_set_dispatch_buf.c @@ -1,9 +1,15 @@ -// RUN: %libomp-compile && env KMP_DISP_NUM_BUFFERS=0 %libomp-run -// RUN: env KMP_DISP_NUM_BUFFERS=1 %libomp-run && env KMP_DISP_NUM_BUFFERS=3 %libomp-run -// RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run && env KMP_DISP_NUM_BUFFERS=7 %libomp-run -// RUN: %libomp-compile -DMY_SCHEDULE=guided && env KMP_DISP_NUM_BUFFERS=1 %libomp-run -// RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run && env KMP_DISP_NUM_BUFFERS=4 %libomp-run +// RUN: %libomp-compile +// RUN: env KMP_DISP_NUM_BUFFERS=0 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=1 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run // RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run +// RUN: %libomp-compile -DMY_SCHEDULE=guided +// RUN: env KMP_DISP_NUM_BUFFERS=1 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run +// RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run +// UNSUPPORTED: clang-11, clang-12 #include #include #include diff --git a/runtime/test/env/omp_alloc_env_invalid.c b/runtime/test/env/omp_alloc_env_invalid.c new file mode 100644 index 000000000..6b2e9d821 --- /dev/null +++ b/runtime/test/env/omp_alloc_env_invalid.c @@ -0,0 +1,16 @@ +// RUN: %libomp-compile +// RUN: env OMP_ALLOCATOR=111 %libomp-run 2>&1 | FileCheck %s +// RUN: env OMP_ALLOCATOR=omp_default_mem_alloc_xyz %libomp-run 2>&1 | FileCheck %s +// UNSUPPORTED: gcc + +// Both invocations of the test should produce (different) warnings: +// OMP: Warning #42: OMP_ALLOCATOR: "111" is an invalid value; ignored. +// OMP: Warning #189: Allocator omp_const_mem_alloc is not available, will use default allocator. +#include +#include +int main() { + volatile int n = omp_get_max_threads(); // causes library initialization + return 0; +} + +// CHECK: {{^OMP: Warning #[0-9]+}}: {{.*$}} diff --git a/runtime/test/env/omp_target_offload.c b/runtime/test/env/omp_target_offload.c index 91ce108b5..4771e3c75 100644 --- a/runtime/test/env/omp_target_offload.c +++ b/runtime/test/env/omp_target_offload.c @@ -9,6 +9,7 @@ enum kmp_target_offload_kind { }; extern int __kmpc_get_target_offload(); +extern void kmp_set_defaults(char const *str); const char *disabled_examples[] = { // Allowed inputs diff --git a/runtime/test/lit.cfg b/runtime/test/lit.cfg index 22be385ab..92593e274 100644 --- a/runtime/test/lit.cfg +++ b/runtime/test/lit.cfg @@ -27,7 +27,7 @@ def append_dynamic_library_path(path): config.environment[name] = path # name: The name of this test suite. -config.name = 'libomp' +config.name = 'bolt-libomp' # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp'] @@ -65,6 +65,15 @@ if config.using_hwloc: append_dynamic_library_path(config.hwloc_library_dir) config.available_features.add('hwloc') +# Setup BOLT flag +if config.using_abt: + config.available_features.add('abt') + libs += " -labt" + # Some tasking tests require larger stack size. 
+ config.environment['ABT_THREAD_STACKSIZE'] = "262144" + # Sleep alleviates oversubscription overheads when -j is specified. + config.environment['KMP_ABT_SCHED_SLEEP'] = "1" + # Rpath modifications for Darwin if config.operating_system == 'Darwin': config.test_flags += " -Wl,-rpath," + config.library_dir @@ -100,18 +109,26 @@ if config.operating_system == 'NetBSD': if config.operating_system in ['Linux', 'Windows']: config.available_features.add('affinity') +import multiprocessing +try: + if multiprocessing.cpu_count() > 1: + config.available_features.add('multicpu') +except NotImplementedError: + pass + # to run with icc INTEL_LICENSE_FILE must be set if 'INTEL_LICENSE_FILE' in os.environ: config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE'] - # substitutions config.substitutions.append(("%libomp-compile-and-run", \ "%libomp-compile && %libomp-run")) config.substitutions.append(("%libomp-cxx-compile-and-run", \ "%libomp-cxx-compile && %libomp-run")) +config.substitutions.append(("%libomp-cxx-compile-c", \ + "%clangXX %openmp_flags %flags -std=c++14 -x c++ %s -o %t" + libs)) config.substitutions.append(("%libomp-cxx-compile", \ - "%clangXX %openmp_flags %flags -std=c++11 %s -o %t" + libs)) + "%clangXX %openmp_flags %flags -std=c++14 %s -o %t" + libs)) config.substitutions.append(("%libomp-compile", \ "%clang %openmp_flags %flags %s -o %t" + libs)) config.substitutions.append(("%libomp-run", "%t")) @@ -120,9 +137,9 @@ config.substitutions.append(("%clang", config.test_c_compiler)) config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%python", '"%s"' % (sys.executable))) +config.substitutions.append(("%not", config.test_not)) if config.has_ompt: - config.substitutions.append(("FileCheck", "tee %%t.out | %s" % config.test_filecheck)) config.substitutions.append(("%sort-threads", "sort -n -s")) if config.operating_system == 'Windows': # No such environment variable on Windows. 
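The 'abt' lit feature introduced in the hunk above is what BOLT-specific tests key on: a test declares `// REQUIRES: abt` (or `// UNSUPPORTED: abt`) and can then rely on the Argobots runtime together with the ABT_EXIT_IF_FAIL and THREAD_SCHED_POINT helpers that this patch adds to omp_testsuite.h. A minimal sketch of such a test follows; the file name and body are illustrative only and are not part of this patch, and it assumes that abt.h is the public Argobots header and that omp_testsuite.h pulls in omp.h, as the bolt/threadid tests earlier in this diff do.

// RUN: %libomp-compile-and-run
// REQUIRES: abt
// Illustrative sketch only: checks that an OpenMP thread keeps its thread
// number across an Argobots yield, mirroring the bolt/threadid tests above.
#include "omp_testsuite.h"
#include <abt.h>
#include <omp.h>

int main() {
  int mismatches = 0;
  #pragma omp parallel num_threads(4) reduction(+:mismatches)
  {
    ABT_thread self;
    // Every OpenMP thread is expected to be backed by an Argobots ULT.
    ABT_EXIT_IF_FAIL(ABT_thread_self(&self));
    int before = omp_get_thread_num();
    THREAD_SCHED_POINT();  // yield to the Argobots scheduler
    if (before != omp_get_thread_num())
      mismatches++;        // the OpenMP thread number must survive the yield
  }
  return mismatches ? 1 : 0;  // non-zero exit marks the lit test as failed
}
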
@@ -135,3 +152,5 @@ if config.has_ompt: else: config.substitutions.append(("%preload-tool", "env LD_PRELOAD=%T/tool.so")) config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed")) +else: + config.substitutions.append(("FileCheck", config.test_filecheck)) diff --git a/runtime/test/lit.site.cfg.in b/runtime/test/lit.site.cfg.in index c2825ee4e..a28eda7ef 100644 --- a/runtime/test/lit.site.cfg.in +++ b/runtime/test/lit.site.cfg.in @@ -4,15 +4,17 @@ config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.test_not = "@OPENMP_NOT_EXECUTABLE@" config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" config.test_extra_flags = "@OPENMP_TEST_FLAGS@" config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMP_LIBRARY_DIR@" +config.library_dir = "@LIBBOLT_LIBRARY_DIR@" config.omp_header_directory = "@LIBOMP_BINARY_DIR@/src" config.operating_system = "@CMAKE_SYSTEM_NAME@" config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@" config.using_hwloc = @LIBOMP_USE_HWLOC@ -config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@ +config.using_abt = @LIBOMP_TEST_USE_ARGOBOTS@ +config.has_ompt = @LIBBOLT_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@ config.has_libm = @LIBOMP_HAVE_LIBM@ config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@ diff --git a/runtime/test/lock/omp_init_lock.c b/runtime/test/lock/omp_init_lock.c index 24b60d1ab..db57738d9 100644 --- a/runtime/test/lock/omp_init_lock.c +++ b/runtime/test/lock/omp_init_lock.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: dummy #include "omp_testsuite.h" #include @@ -23,7 +24,7 @@ int test_omp_init_lock() { omp_unset_lock(&my_lcks[j % LOCKS_PER_ITER]); } } - // Wait until all repititions are done. The test is exercising growth of + // Wait until all repetitions are done. The test is exercising growth of // the global lock pool, which does not shrink when no locks are allocated. { int j; diff --git a/runtime/test/misc_bugs/cancellation_for_sections.c b/runtime/test/misc_bugs/cancellation_for_sections.c index 7cdaa1f3c..eba495648 100644 --- a/runtime/test/misc_bugs/cancellation_for_sections.c +++ b/runtime/test/misc_bugs/cancellation_for_sections.c @@ -1,6 +1,6 @@ // RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run // Clang had a bug until version 4.0.1 which resulted in a hang. -// UNSUPPORTED: clang-3, clang-4.0.0 +// UNSUPPORTED: abt, clang-3, clang-4.0.0 // Regression test for a bug in cancellation to cover effect of `#pragma omp cancel` // in a loop construct, on sections construct. diff --git a/runtime/test/misc_bugs/for-task-for-task.c b/runtime/test/misc_bugs/for-task-for-task.c new file mode 100644 index 000000000..9ff2764e2 --- /dev/null +++ b/runtime/test/misc_bugs/for-task-for-task.c @@ -0,0 +1,76 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +#define NUM_OUTER_THREADS 16 +#define NUM_INNER_THREADS 16 +#define SMALL_LOOPCOUNT 64 + +/*! 
Utility function to spend some time in a loop */ +static void do_some_work (void) { + int i; + double sum = 0; + for(i = 0; i < 1000; i++) { + sum += sqrt(i); + } +} + +int test_omp_parallel_for_task_for_task() { + int vals[SMALL_LOOPCOUNT]; + int i; + for (i = 0; i < SMALL_LOOPCOUNT; i++) { + vals[i] = 0; + } + #pragma omp parallel firstprivate(vals) num_threads(NUM_OUTER_THREADS) + #pragma omp master + { + for (i = 1; i <= SMALL_LOOPCOUNT; i++) { + #pragma omp task firstprivate(i) firstprivate(vals) + { + #pragma omp parallel num_threads(NUM_INNER_THREADS) firstprivate(i) + #pragma omp master + { + int j; + for (j = 1; j <= SMALL_LOOPCOUNT; j++) { + #pragma omp task firstprivate(i) + { + int k; + do_some_work(); + for (k = 0; k < j % 4; k++) { + #pragma omp taskyield + } + #pragma omp atomic + vals[i] += j; + } + } + } + { + int j; + for (j = 0; j < i % 5; j++) { + #pragma omp taskyield + } + } + } + } + } + int num_failed = 0; + int known_sum = SMALL_LOOPCOUNT * (SMALL_LOOPCOUNT + 1) / 2; + for (i = 0; i < SMALL_LOOPCOUNT; i++) { + if (vals[i] != known_sum) + num_failed++; + } + return num_failed ? 1 : 0; +} + +int main() { + int i; + int num_failed = 0; + + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_parallel_for_task_for_task()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/misc_bugs/for-task-for.c b/runtime/test/misc_bugs/for-task-for.c new file mode 100644 index 000000000..30f9f6244 --- /dev/null +++ b/runtime/test/misc_bugs/for-task-for.c @@ -0,0 +1,69 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +#define NUM_OUTER_THREADS 16 +#define NUM_INNER_THREADS 16 +#define SMALL_LOOPCOUNT 64 + +/*! Utility function to spend some time in a loop */ +static void do_some_work (void) { + int i; + double sum = 0; + for(i = 0; i < 1000; i++) { + sum += sqrt(i); + } +} + +int test_omp_parallel_for_task_for() { + int vals[SMALL_LOOPCOUNT]; + int i; + for (i = 0; i < SMALL_LOOPCOUNT; i++) { + vals[i] = 0; + } + #pragma omp parallel firstprivate(vals) num_threads(NUM_OUTER_THREADS) + #pragma omp master + { + for (i = 1; i <= SMALL_LOOPCOUNT; i++) { + #pragma omp task firstprivate(i) firstprivate(vals) + { + int local_sum = 0; + int j; + #pragma omp parallel for reduction(+:local_sum) \ + num_threads(NUM_INNER_THREADS) + for (j = 1; j <= SMALL_LOOPCOUNT; j++) { + int k; + do_some_work(); + for (k = 0; k < j % 4; k++) { + #pragma omp taskyield + } + local_sum += j; + } + for (j = 0; j < i % 5; j++) { + #pragma omp taskyield + } + vals[i] = local_sum; + } + } + } + int num_failed = 0; + int known_sum = SMALL_LOOPCOUNT * (SMALL_LOOPCOUNT + 1) / 2; + for (i = 0; i < SMALL_LOOPCOUNT; i++) { + if (vals[i] != known_sum) + num_failed++; + } + return num_failed ? 
1 : 0; +} + +int main() { + int i; + int num_failed = 0; + + for (i = 0; i < REPETITIONS; i++) { + if (!test_omp_parallel_for_task_for()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c b/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c index a8400e454..f1e5f1bd0 100644 --- a/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c +++ b/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt #include #include "omp_testsuite.h" @@ -37,6 +38,7 @@ void* thread_function(void* arg) { a++; } } + return NULL; } int test_omp_team_reuse() diff --git a/runtime/test/misc_bugs/stack-propagate.c b/runtime/test/misc_bugs/stack-propagate.c index ac289b56c..8d21d9c36 100644 --- a/runtime/test/misc_bugs/stack-propagate.c +++ b/runtime/test/misc_bugs/stack-propagate.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// UNSUPPORTED: abt // https://bugs.llvm.org/show_bug.cgi?id=26540 requested // stack size to be propagated from master to workers. diff --git a/runtime/test/omp_testsuite.h b/runtime/test/omp_testsuite.h index eef547097..bd519a483 100644 --- a/runtime/test/omp_testsuite.h +++ b/runtime/test/omp_testsuite.h @@ -76,4 +76,21 @@ static int pthread_join(pthread_t thread, void **retval) { # include #endif +#ifdef BOLT_VERSION +# if BOLT_THREAD == BOLT_THREAD_ARGOBOTS +# include +# define ABT_EXIT_IF_FAIL(_abt_call) \ + do { \ + int _abt_ret = (_abt_call); \ + if (_abt_ret != ABT_SUCCESS) \ + exit(1); \ + } while (0) +# define THREAD_SCHED_POINT() ABT_EXIT_IF_FAIL(ABT_thread_yield()) +# else +# define THREAD_SCHED_POINT() do {;} while(0) +# endif +#else +# define THREAD_SCHED_POINT() do {;} while(0) +#endif + #endif diff --git a/runtime/test/ompt/callback.h b/runtime/test/ompt/callback.h old mode 100755 new mode 100644 index 64463ec83..2bc26b51d --- a/runtime/test/ompt/callback.h +++ b/runtime/test/ompt/callback.h @@ -1,7 +1,9 @@ #ifndef _BSD_SOURCE #define _BSD_SOURCE #endif +#ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE +#endif #include #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -14,22 +16,25 @@ // Used to detect architecture #include "../../src/kmp_platform.h" -static const char* ompt_thread_t_values[] = { - NULL, - "ompt_thread_initial", - "ompt_thread_worker", - "ompt_thread_other" -}; +#ifndef _TOOL_PREFIX +#define _TOOL_PREFIX "" +// If no _TOOL_PREFIX is set, we assume that we run as part of an OMPT test +#define _OMPT_TESTS +#endif -static const char* ompt_task_status_t_values[] = { - NULL, - "ompt_task_complete", // 1 - "ompt_task_yield", // 2 - "ompt_task_cancel", // 3 - "ompt_task_detach", // 4 - "ompt_task_early_fulfill", // 5 - "ompt_task_late_fulfill", // 6 - "ompt_task_switch" // 7 +static const char *ompt_thread_t_values[] = { + "ompt_thread_UNDEFINED", "ompt_thread_initial", "ompt_thread_worker", + "ompt_thread_other"}; + +static const char *ompt_task_status_t_values[] = { + "ompt_task_UNDEFINED", + "ompt_task_complete", // 1 + "ompt_task_yield", // 2 + "ompt_task_cancel", // 3 + "ompt_task_detach", // 4 + "ompt_task_early_fulfill", // 5 + "ompt_task_late_fulfill", // 6 + "ompt_task_switch" // 7 }; static const char* ompt_cancel_flag_t_values[] = { "ompt_cancel_parallel", @@ -41,6 +46,17 @@ static const char* ompt_cancel_flag_t_values[] = { "ompt_cancel_discarded_task" }; +static const char *ompt_dependence_type_t_values[] = { + "ompt_dependence_type_UNDEFINED", + "ompt_dependence_type_in", // 1 + "ompt_dependence_type_out", 
// 2 + "ompt_dependence_type_inout", // 3 + "ompt_dependence_type_mutexinoutset", // 4 + "ompt_dependence_type_source", // 5 + "ompt_dependence_type_sink", // 6 + "ompt_dependence_type_inoutset" // 7 +}; + static void format_task_type(int type, char *buffer) { char *progress = buffer; if (type & ompt_task_initial) @@ -149,7 +165,7 @@ ompt_label_##id: print_possible_return_addresses(get_ompt_label_address(id)) #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -// On X86 the NOP instruction is 1 byte long. In addition, the comiler inserts +// On X86 the NOP instruction is 1 byte long. In addition, the compiler inserts // a MOV instruction for non-void runtime functions which is 3 bytes long. #define print_possible_return_addresses(addr) \ printf("%" PRIu64 ": current_address=%p or %p for non-void functions\n", \ @@ -239,19 +255,34 @@ on_ompt_callback_mutex_acquire( switch(kind) { case ompt_mutex_lock: - printf("%" PRIu64 ": ompt_event_wait_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; case ompt_mutex_nest_lock: - printf("%" PRIu64 ": ompt_event_wait_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; case ompt_mutex_critical: - printf("%" PRIu64 ": ompt_event_wait_critical: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_critical: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; case ompt_mutex_atomic: - printf("%" PRIu64 ": ompt_event_wait_atomic: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_atomic: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; case ompt_mutex_ordered: - printf("%" PRIu64 ": ompt_event_wait_ordered: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_ordered: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; default: break; @@ -267,19 +298,33 @@ on_ompt_callback_mutex_acquired( switch(kind) { case ompt_mutex_lock: - printf("%" PRIu64 ": ompt_event_acquired_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_nest_lock: - printf("%" PRIu64 ": ompt_event_acquired_nest_lock_first: wait_id=%" 
PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_nest_lock_first: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_critical: - printf("%" PRIu64 ": ompt_event_acquired_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_critical: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_atomic: - printf("%" PRIu64 ": ompt_event_acquired_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_atomic: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_ordered: - printf("%" PRIu64 ": ompt_event_acquired_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_ordered: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; default: break; @@ -295,19 +340,33 @@ on_ompt_callback_mutex_released( switch(kind) { case ompt_mutex_lock: - printf("%" PRIu64 ": ompt_event_release_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_nest_lock: - printf("%" PRIu64 ": ompt_event_release_nest_lock_last: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_nest_lock_last: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_critical: - printf("%" PRIu64 ": ompt_event_release_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_critical: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_atomic: - printf("%" PRIu64 ": ompt_event_release_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_atomic: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_ordered: - printf("%" PRIu64 ": ompt_event_release_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_ordered: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; default: break; @@ -323,11 +382,20 @@ on_ompt_callback_nest_lock( switch(endpoint) { case ompt_scope_begin: - printf("%" PRIu64 ": ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case 
ompt_scope_end: - printf("%" PRIu64 ": ompt_event_release_nest_lock_prev: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_release_nest_lock_prev: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -346,18 +414,36 @@ on_ompt_callback_sync_region( { case ompt_sync_region_barrier: case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_teams: case ompt_sync_region_barrier_explicit: case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_barrier_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); print_ids(0); break; case ompt_sync_region_taskwait: - printf("%" PRIu64 ": ompt_event_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskwait_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_sync_region_taskgroup: - printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskgroup_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); break; } break; @@ -367,19 +453,43 @@ on_ompt_callback_sync_region( case ompt_sync_region_barrier: case ompt_sync_region_barrier_implicit: case ompt_sync_region_barrier_explicit: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_teams: case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_barrier_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? 
parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_taskwait: - printf("%" PRIu64 ": ompt_event_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskwait_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_taskgroup: - printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskgroup_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); break; } break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -398,17 +508,35 @@ on_ompt_callback_sync_region_wait( { case ompt_sync_region_barrier: case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_teams: case ompt_sync_region_barrier_explicit: case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_sync_region_taskwait: - printf("%" PRIu64 ": ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_sync_region_taskgroup: - printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; @@ -417,20 +545,72 @@ on_ompt_callback_sync_region_wait( { case ompt_sync_region_barrier: case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_teams: case 
ompt_sync_region_barrier_explicit: case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_barrier_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_taskwait: - printf("%" PRIu64 ": ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_taskgroup: - printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); + } +} + +static void on_ompt_callback_reduction(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + switch (endpoint) { + case ompt_scope_begin: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_reduction_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_reduction_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? 
parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -439,7 +619,8 @@ on_ompt_callback_flush( ompt_data_t *thread_data, const void *codeptr_ra) { - printf("%" PRIu64 ": ompt_event_flush: codeptr_ra=%p\n", thread_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_flush: codeptr_ra=%p\n", + thread_data->value, codeptr_ra); } static void @@ -466,7 +647,10 @@ on_ompt_callback_cancel( else if(flags & ompt_cancel_discarded_task) second_flag_value = ompt_cancel_flag_t_values[6]; - printf("%" PRIu64 ": ompt_event_cancel: task_data=%" PRIu64 ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, task_data->value, first_flag_value, second_flag_value, flags, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_cancel: task_data=%" PRIu64 + ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, task_data->value, first_flag_value, + second_flag_value, flags, codeptr_ra); } static void @@ -496,24 +680,44 @@ on_ompt_callback_implicit_task( if (team_size == 1 && thread_num == 1 && parallel_data->ptr) printf("%s\n", "0: parallel_data initially not null"); parallel_data->value = ompt_get_unique_id(); - printf("%" PRIu64 ": ompt_event_initial_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32 ", index=%" PRIu32 ", flags=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num, flags); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_initial_task_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32 + ", index=%" PRIu32 ", flags=%" PRIu32 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, team_size, thread_num, flags); } else { - printf("%" PRIu64 ": ompt_event_implicit_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_implicit_task_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", team_size=%" PRIu32 + ", thread_num=%" PRIu32 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, team_size, thread_num); } break; case ompt_scope_end: if(flags & ompt_task_initial){ - printf("%" PRIu64 ": ompt_event_initial_task_end: parallel_id=%" PRIu64 + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_initial_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32 ", index=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data) ? parallel_data->value : 0, task_data->value, team_size, thread_num); } else { - printf("%" PRIu64 ": ompt_event_implicit_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, team_size, thread_num); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_implicit_task_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", team_size=%" PRIu32 + ", thread_num=%" PRIu32 "\n", + ompt_get_thread_data()->value, + (parallel_data) ? 
parallel_data->value : 0, task_data->value, + team_size, thread_num); } break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -528,10 +732,16 @@ on_ompt_callback_lock_init( switch(kind) { case ompt_mutex_lock: - printf("%" PRIu64 ": ompt_event_init_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_init_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; case ompt_mutex_nest_lock: - printf("%" PRIu64 ": ompt_event_init_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_init_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 + ", impl=%" PRIu32 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra); break; default: break; @@ -547,10 +757,15 @@ on_ompt_callback_lock_destroy( switch(kind) { case ompt_mutex_lock: - printf("%" PRIu64 ": ompt_event_destroy_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_destroy_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; case ompt_mutex_nest_lock: - printf("%" PRIu64 ": ompt_event_destroy_nest_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_destroy_nest_lock: wait_id=%" PRIu64 + ", codeptr_ra=%p \n", + ompt_get_thread_data()->value, wait_id, codeptr_ra); break; default: break; @@ -572,26 +787,63 @@ on_ompt_callback_work( switch(wstype) { case ompt_work_loop: - printf("%" PRIu64 ": ompt_event_loop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_loop_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_sections: - printf("%" PRIu64 ": ompt_event_sections_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_sections_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_single_executor: - printf("%" PRIu64 ": ompt_event_single_in_block_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_single_in_block_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case 
ompt_work_single_other: - printf("%" PRIu64 ": ompt_event_single_others_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_single_others_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_workshare: //impl break; case ompt_work_distribute: - printf("%" PRIu64 ": ompt_event_distribute_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_distribute_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_taskloop: //impl - printf("%" PRIu64 ": ompt_event_taskloop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskloop_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); + break; + case ompt_work_scope: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_scope_begin: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; } break; @@ -599,47 +851,92 @@ on_ompt_callback_work( switch(wstype) { case ompt_work_loop: - printf("%" PRIu64 ": ompt_event_loop_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_loop_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_sections: - printf("%" PRIu64 ": ompt_event_sections_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_sections_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_single_executor: - printf("%" PRIu64 ": ompt_event_single_in_block_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_single_in_block_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_single_other: - printf("%" PRIu64 ": ompt_event_single_others_end: 
parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_single_others_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_workshare: //impl break; case ompt_work_distribute: - printf("%" PRIu64 ": ompt_event_distribute_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_distribute_end: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; case ompt_work_taskloop: //impl - printf("%" PRIu64 ": ompt_event_taskloop_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_taskloop_end: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); + break; + case ompt_work_scope: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_scope_end: parallel_id=%" PRIu64 + ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra, count); break; } break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } -static void -on_ompt_callback_master( - ompt_scope_endpoint_t endpoint, - ompt_data_t *parallel_data, - ompt_data_t *task_data, - const void *codeptr_ra) -{ +static void on_ompt_callback_masked(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { switch(endpoint) { case ompt_scope_begin: - printf("%" PRIu64 ": ompt_event_master_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_masked_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; case ompt_scope_end: - printf("%" PRIu64 ": ompt_event_master_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_masked_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 + ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, parallel_data->value, + task_data->value, codeptr_ra); break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -653,7 +950,8 @@ static void on_ompt_callback_parallel_begin( int invoker = flag & 0xF; const char *event = (flag & ompt_parallel_team) ? "parallel" : "teams"; const char *size = (flag & ompt_parallel_team) ? 
"team_size" : "num_teams"; - printf("%" PRIu64 ": ompt_event_%s_begin: parent_task_id=%" PRIu64 + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_%s_begin: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, " "parallel_id=%" PRIu64 ", requested_%s=%" PRIu32 ", codeptr_ra=%p, invoker=%d\n", @@ -668,7 +966,7 @@ static void on_ompt_callback_parallel_end(ompt_data_t *parallel_data, int flag, const void *codeptr_ra) { int invoker = flag & 0xF; const char *event = (flag & ompt_parallel_team) ? "parallel" : "teams"; - printf("%" PRIu64 ": ompt_event_%s_end: parallel_id=%" PRIu64 + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_%s_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", invoker=%d, codeptr_ra=%p\n", ompt_get_thread_data()->value, event, parallel_data->value, encountering_task_data->value, invoker, codeptr_ra); @@ -690,7 +988,18 @@ on_ompt_callback_task_create( format_task_type(type, buffer); - printf("%" PRIu64 ": ompt_event_task_create: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, new_task_id=%" PRIu64 ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n", ompt_get_thread_data()->value, encountering_task_data ? encountering_task_data->value : 0, encountering_task_frame ? encountering_task_frame->exit_frame.ptr : NULL, encountering_task_frame ? encountering_task_frame->enter_frame.ptr : NULL, new_task_data->value, codeptr_ra, buffer, type, has_dependences ? "yes" : "no"); + printf( + "%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_task_create: parent_task_id=%" PRIu64 + ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, " + "new_task_id=%" PRIu64 + ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n", + ompt_get_thread_data()->value, + encountering_task_data ? encountering_task_data->value : 0, + encountering_task_frame ? encountering_task_frame->exit_frame.ptr : NULL, + encountering_task_frame ? encountering_task_frame->enter_frame.ptr : NULL, + new_task_data->value, codeptr_ra, buffer, type, + has_dependences ? "yes" : "no"); } static void @@ -699,10 +1008,16 @@ on_ompt_callback_task_schedule( ompt_task_status_t prior_task_status, ompt_data_t *second_task_data) { - printf("%" PRIu64 ": ompt_event_task_schedule: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n", ompt_get_thread_data()->value, first_task_data->value, second_task_data->value, ompt_task_status_t_values[prior_task_status], prior_task_status); - if(prior_task_status == ompt_task_complete) - { - printf("%" PRIu64 ": ompt_event_task_end: task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_task_schedule: first_task_id=%" PRIu64 + ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n", + ompt_get_thread_data()->value, first_task_data->value, + (second_task_data ? 
second_task_data->value : -1), + ompt_task_status_t_values[prior_task_status], prior_task_status); + if (prior_task_status == ompt_task_complete || + prior_task_status == ompt_task_late_fulfill) { + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_task_end: task_id=%" PRIu64 + "\n", ompt_get_thread_data()->value, first_task_data->value); } } @@ -712,7 +1027,24 @@ on_ompt_callback_dependences( const ompt_dependence_t *deps, int ndeps) { - printf("%" PRIu64 ": ompt_event_task_dependences: task_id=%" PRIu64 ", deps=%p, ndeps=%d\n", ompt_get_thread_data()->value, task_data->value, (void *)deps, ndeps); + char buffer[2048]; + char *progress = buffer; + for (int i = 0; i < ndeps && progress < buffer + 2000; i++) { + if (deps[i].dependence_type == ompt_dependence_type_source || + deps[i].dependence_type == ompt_dependence_type_sink) + progress += + sprintf(progress, "(%" PRIu64 ", %s), ", deps[i].variable.value, + ompt_dependence_type_t_values[deps[i].dependence_type]); + else + progress += + sprintf(progress, "(%p, %s), ", deps[i].variable.ptr, + ompt_dependence_type_t_values[deps[i].dependence_type]); + } + if (ndeps > 0) + progress[-2] = 0; + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_dependences: task_id=%" PRIu64 + ", deps=[%s], ndeps=%d\n", + ompt_get_thread_data()->value, task_data->value, buffer, ndeps); } static void @@ -720,7 +1052,11 @@ on_ompt_callback_task_dependence( ompt_data_t *first_task_data, ompt_data_t *second_task_data) { - printf("%" PRIu64 ": ompt_event_task_dependence_pair: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value, second_task_data->value); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_task_dependence_pair: first_task_id=%" PRIu64 + ", second_task_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, first_task_data->value, + second_task_data->value); } static void @@ -731,14 +1067,19 @@ on_ompt_callback_thread_begin( if(thread_data->ptr) printf("%s\n", "0: thread_data initially not null"); thread_data->value = ompt_get_unique_id(); - printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, ompt_thread_t_values[thread_type], thread_type, thread_data->value); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, ompt_thread_t_values[thread_type], + thread_type, thread_data->value); } static void on_ompt_callback_thread_end( ompt_data_t *thread_data) { - printf("%" PRIu64 ": ompt_event_thread_end: thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, thread_data->value); + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_thread_end: thread_id=%" PRIu64 + "\n", + ompt_get_thread_data()->value, thread_data->value); } static int @@ -750,7 +1091,36 @@ on_ompt_callback_control_tool( { ompt_frame_t* omptTaskFrame; ompt_get_task_info(0, NULL, (ompt_data_t**) NULL, &omptTaskFrame, NULL, NULL); - printf("%" PRIu64 ": ompt_event_control_tool: command=%" PRIu64 ", modifier=%" PRIu64 ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, current_task_frame.reenter=%p \n", ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra, omptTaskFrame->exit_frame.ptr, omptTaskFrame->enter_frame.ptr); + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_control_tool: command=%" PRIu64 + ", modifier=%" PRIu64 + ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, " + "current_task_frame.reenter=%p \n", + ompt_get_thread_data()->value, command, modifier, arg, 
codeptr_ra, + omptTaskFrame->exit_frame.ptr, omptTaskFrame->enter_frame.ptr); + + // the following would interfere with expected output for OMPT tests, so skip +#ifndef _OMPT_TESTS + // print task data + int task_level = 0; + ompt_data_t *task_data; + while (ompt_get_task_info(task_level, NULL, (ompt_data_t **)&task_data, NULL, + NULL, NULL)) { + printf("%" PRIu64 ":" _TOOL_PREFIX " task level %d: task_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, task_level, task_data->value); + task_level++; + } + + // print parallel data + int parallel_level = 0; + ompt_data_t *parallel_data; + while (ompt_get_parallel_info(parallel_level, (ompt_data_t **)¶llel_data, + NULL)) { + printf("%" PRIu64 ":" _TOOL_PREFIX " parallel level %d: parallel_id=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_level, parallel_data->value); + parallel_level++; + } +#endif return 0; //success } @@ -769,6 +1139,8 @@ int ompt_initialize( ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id"); ompt_finalize_tool = (ompt_finalize_tool_t)lookup("ompt_finalize_tool"); + ompt_get_unique_id(); + ompt_get_num_procs = (ompt_get_num_procs_t) lookup("ompt_get_num_procs"); ompt_get_num_places = (ompt_get_num_places_t) lookup("ompt_get_num_places"); ompt_get_place_proc_ids = (ompt_get_place_proc_ids_t) lookup("ompt_get_place_proc_ids"); @@ -784,6 +1156,7 @@ int ompt_initialize( register_callback(ompt_callback_nest_lock); register_callback(ompt_callback_sync_region); register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t); + register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t); register_callback(ompt_callback_control_tool); register_callback(ompt_callback_flush); register_callback(ompt_callback_cancel); @@ -791,7 +1164,7 @@ int ompt_initialize( register_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t); register_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t); register_callback(ompt_callback_work); - register_callback(ompt_callback_master); + register_callback(ompt_callback_masked); register_callback(ompt_callback_parallel_begin); register_callback(ompt_callback_parallel_end); register_callback(ompt_callback_task_create); @@ -823,3 +1196,6 @@ ompt_start_tool_result_t* ompt_start_tool( } #endif #endif // ifndef USE_PRIVATE_TOOL +#ifdef _OMPT_TESTS +#undef _OMPT_TESTS +#endif diff --git a/runtime/test/ompt/cancel/cancel_taskgroup.c b/runtime/test/ompt/cancel/cancel_taskgroup.c index fce39c91f..23e5de7cc 100644 --- a/runtime/test/ompt/cancel/cancel_taskgroup.c +++ b/runtime/test/ompt/cancel/cancel_taskgroup.c @@ -58,17 +58,18 @@ int main() #pragma omp barrier } - // Check if libomp supports the callbacks for this test. 
- // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_masked' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' - // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_masked_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], + // CHECK-SAME: task_id=[[PARENT_TASK_ID:[0-9]+]], + // CHECK-SAME: codeptr_ra={{0x[0-f]*}} // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[FIRST_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[SECOND_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no diff --git a/runtime/test/ompt/loadtool/tool_available/tool_available.c b/runtime/test/ompt/loadtool/tool_available/tool_available.c index 25187fdb0..ed027e0b8 100644 --- a/runtime/test/ompt/loadtool/tool_available/tool_available.c +++ b/runtime/test/ompt/loadtool/tool_available/tool_available.c @@ -1,20 +1,75 @@ // The OpenMP standard defines 3 ways of providing ompt_start_tool: -// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application" -// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s + +// 1. "statically-linking the tool’s definition of ompt_start_tool into an +// OpenMP application" + +// RUN: %libomp-compile -DCODE -DTOOL && env OMP_TOOL_VERBOSE_INIT=stdout \ +// RUN: %libomp-run | FileCheck %s --check-prefixes CHECK,ADDRSPACE // Note: We should compile the tool without -fopenmp as other tools developer -// would do. Otherwise this test may pass for the wrong reasons on Darwin. +// would do. Otherwise this test may pass for the wrong reasons on Darwin. + // RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so -// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space" + +// 2. 
"introducing a dynamically-linked library that includes the tool’s +// definition of ompt_start_tool into the application’s address space" + // 2.1 Link with tool during compilation -// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s + +// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | FileCheck %s \ +// RUN: --check-prefixes CHECK,ADDRSPACE + // 2.2 Link with tool during compilation, but AFTER the runtime -// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s + +// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | FileCheck %s \ +// RUN: --check-prefixes CHECK,ADDRSPACE + // 2.3 Inject tool via the dynamic loader -// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s -// 3. "providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV" -// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s +// RUN: %libomp-compile -DCODE && env OMP_TOOL_VERBOSE_INIT=stdout \ +// RUN: %preload-tool %libomp-run | FileCheck %s \ +// RUN: --check-prefixes CHECK,ADDRSPACE + +// 3. "providing the name of a dynamically-linked library appropriate for the +// architecture and operating system used by the application in the +// tool-libraries-var ICV" + +// 3.1 OMP_TOOL_VERBOSE_INIT not set + +// RUN: %libomp-compile -DCODE && \ +// RUN: env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s + +// 3.2 OMP_TOOL_VERBOSE_INIT disabled + +// RUN: env OMP_TOOL_LIBRARIES=%T/tool.so OMP_TOOL_VERBOSE_INIT=disabled \ +// RUN: %libomp-run | FileCheck %s + +// 3.3 OMP_TOOL_VERBOSE_INIT to stdout + +// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so \ +// RUN: OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | \ +// RUN: FileCheck %s -DPARENTPATH=%T --check-prefixes CHECK,TOOLLIB + +// 3.4 OMP_TOOL_VERBOSE_INIT to stderr, check merged stdout and stderr + +// RUN: env OMP_TOOL_LIBRARIES=%T/tool.so OMP_TOOL_VERBOSE_INIT=stderr \ +// RUN: %libomp-run 2>&1 | \ +// RUN: FileCheck %s -DPARENTPATH=%T --check-prefixes CHECK,TOOLLIB + +// 3.5 OMP_TOOL_VERBOSE_INIT to stderr, check just stderr + +// RUN: env OMP_TOOL_LIBRARIES=%T/tool.so OMP_TOOL_VERBOSE_INIT=stderr \ +// RUN: %libomp-run 2>&1 >/dev/null | \ +// RUN: FileCheck %s -DPARENTPATH=%T --check-prefixes TOOLLIB + +// 3.6 OMP_TOOL_VERBOSE_INIT to file "init.log" + +// RUN: env OMP_TOOL_LIBRARIES=%T/tool.so OMP_TOOL_VERBOSE_INIT=%T/init.log \ +// RUN: %libomp-run | FileCheck %s && cat %T/init.log | \ +// RUN: FileCheck %s -DPARENTPATH=%T --check-prefixes TOOLLIB + // REQUIRES: ompt @@ -25,6 +80,24 @@ * -DCODE enables the code for the executable during compilation */ +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback + +// ADDRSPACE: ----- START LOGGING OF TOOL REGISTRATION ----- +// ADDRSPACE-NEXT: Search for OMP tool in current address space... Sucess. +// ADDRSPACE-NEXT: Tool was started and is using the OMPT interface. +// ADDRSPACE-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + +// TOOLLIB: ----- START LOGGING OF TOOL REGISTRATION ----- +// TOOLLIB-NEXT: Search for OMP tool in current address space... Failed. +// TOOLLIB-NEXT: Searching tool libraries... 
+// TOOLLIB-NEXT: OMP_TOOL_LIBRARIES = [[PARENTPATH]]/tool.so +// TOOLLIB-NEXT: Opening [[PARENTPATH]]/tool.so... Success. +// TOOLLIB-NEXT: Searching for ompt_start_tool in +// TOOLLIB-SAME: [[PARENTPATH]]/tool.so... Success. +// TOOLLIB-NEXT: Tool was started and is using the OMPT interface. +// TOOLLIB-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + #ifdef CODE #include "omp.h" @@ -34,9 +107,8 @@ int main() { } - - // Check if libomp supports the callbacks for this test. - // CHECK-NOT: {{^}}0: Could not register callback + // CHECK-NOT: ----- START LOGGING OF TOOL REGISTRATION ----- + // CHECK-NOT: ----- END LOGGING OF TOOL REGISTRATION ----- // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] // CHECK: {{^}}0: ompt_event_runtime_shutdown @@ -51,10 +123,8 @@ int main() #include #include -int ompt_initialize( - ompt_function_lookup_t lookup, - ompt_data_t* tool_data) -{ +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { printf("0: NULL_POINTER=%p\n", (void*)NULL); return 1; //success } diff --git a/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c b/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c index fedfebe39..e8823dfa3 100644 --- a/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c +++ b/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c @@ -1,7 +1,9 @@ // RUN: %clang %flags -shared -fPIC %s -o %T/first_tool.so // RUN: %clang %flags -DTOOL -DSECOND_TOOL -shared -fPIC %s -o %T/second_tool.so // RUN: %clang %flags -DTOOL -DTHIRD_TOOL -shared -fPIC %s -o %T/third_tool.so -// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/non_existing_file.so:%T/first_tool.so:%T/second_tool.so:%T/third_tool.so %libomp-run | FileCheck %s +// RUN: %libomp-compile -DCODE +// RUN: env OMP_TOOL_LIBRARIES=%T/non_existing_file.so:%T/first_tool.so:%T/second_tool.so:%T/third_tool.so \ +// RUN: OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | FileCheck %s -DPARENTPATH=%T // REQUIRES: ompt @@ -15,6 +17,42 @@ * -DCODE enables the code for the executable during compilation */ +// CHECK: ----- START LOGGING OF TOOL REGISTRATION ----- +// CHECK-NEXT: Search for OMP tool in current address space... Failed. +// CHECK-NEXT: Searching tool libraries... +// CHECK-NEXT: OMP_TOOL_LIBRARIES = [[PARENTPATH]]/non_existing_file.so +// CHECK-SAME: [[PARENTPATH]]/first_tool.so +// CHECK-SAME: [[PARENTPATH]]/second_tool.so +// CHECK-SAME: [[PARENTPATH]]/third_tool.so +// CHECK-NEXT: Opening [[PARENTPATH]]/non_existing_file.so... Failed: +// CHECK-SAME: [[PARENTPATH]]/non_existing_file.so: cannot open shared object +// CHECK-SAME: file: No such file or directory +// CHECK-NEXT: Opening [[PARENTPATH]]/first_tool.so... Success. +// CHECK-NEXT: Searching for ompt_start_tool in +// CHECK-SAME: [[PARENTPATH]]/first_tool.so... Failed: +// CHECK-SAME: [[PARENTPATH]]/first_tool.so: undefined symbol: ompt_start_tool +// CHECK-NEXT: Opening [[PARENTPATH]]/second_tool.so... Success. +// CHECK-NEXT: Searching for ompt_start_tool in +// CHECK-SAME: [[PARENTPATH]]/second_tool.so... 0: Do not initialize tool +// CHECK-NEXT: Found but not using the OMPT interface. +// CHECK-NEXT: Continuing search... +// CHECK-NEXT: Opening [[PARENTPATH]]/third_tool.so... Success. +// CHECK-NEXT: Searching for ompt_start_tool in +// CHECK-SAME: [[PARENTPATH]]/third_tool.so... 0: Do initialize tool +// CHECK-NEXT: Success. +// CHECK-NEXT: Tool was started and is using the OMPT interface. 
+// CHECK-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + +// Check if libomp supports the callbacks for this test. + +// CHECK-NOT: {{^}}0: Could not register callback +// CHECK: {{^}}0: Tool initialized +// CHECK: {{^}}0: ompt_event_thread_begin +// CHECK-DAG: {{^}}0: ompt_event_thread_begin +// CHECK-DAG: {{^}}0: control_tool()=-1 +// CHECK: {{^}}0: Tool finalized + + #ifdef CODE #include "stdio.h" #include "omp.h" @@ -32,19 +70,6 @@ int main() } - // Check if libomp supports the callbacks for this test. - // CHECK-NOT: {{^}}0: Could not register callback - - // CHECK: {{^}}0: Do not initialize tool - - // CHECK: {{^}}0: Do initialize tool - // CHECK: {{^}}0: Tool initialized - // CHECK: {{^}}0: ompt_event_thread_begin - // CHECK-DAG: {{^}}0: ompt_event_thread_begin - // CHECK-DAG: {{^}}0: control_tool()=-1 - // CHECK: {{^}}0: Tool finalized - - return 0; } @@ -76,10 +101,8 @@ on_ompt_callback_thread_begin( printf("0: ompt_event_thread_begin\n"); } -int ompt_initialize( - ompt_function_lookup_t lookup, - ompt_data_t *tool_data) -{ +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback"); ompt_set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_ompt_callback_thread_begin); printf("0: Tool initialized\n"); diff --git a/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c b/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c index ea4046831..7e68d8b33 100644 --- a/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c +++ b/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c @@ -1,20 +1,45 @@ // The OpenMP standard defines 3 ways of providing ompt_start_tool: -// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application" -// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s + +// 1. "statically-linking the tool’s definition of ompt_start_tool into an +// OpenMP application" + +// RUN: %libomp-compile -DCODE -DTOOL && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | \ +// RUN: FileCheck %s --check-prefixes CHECK,ADDRSPACE // Note: We should compile the tool without -fopenmp as other tools developer -// would do. Otherwise this test may pass for the wrong reasons on Darwin. +// would do. Otherwise this test may pass for the wrong reasons on Darwin. + // RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so -// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space" + +// 2. 
"introducing a dynamically-linked library that includes the tool’s +// definition of ompt_start_tool into the application’s address space" + // 2.1 Link with tool during compilation -// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s + +// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | \ +// RUN: FileCheck %s --check-prefixes CHECK,ADDRSPACE + // 2.2 Link with tool during compilation, but AFTER the runtime -// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s + +// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | \ +// RUN: FileCheck %s --check-prefixes CHECK,ADDRSPACE + // 2.3 Inject tool via the dynamic loader -// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s -// 3. "providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV" -// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s +// RUN: %libomp-compile -DCODE && \ +// RUN: env OMP_TOOL_VERBOSE_INIT=stdout %preload-tool %libomp-run | \ +// RUN: FileCheck %s --check-prefixes CHECK,ADDRSPACE + +// 3. "providing the name of a dynamically-linked library appropriate for the +// architecture and operating system used by the application in the +// tool-libraries-var ICV" + +// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so \ +// RUN: OMP_TOOL_VERBOSE_INIT=stdout %libomp-run | \ +// RUN: FileCheck %s -DPARENTPATH=%T --check-prefixes CHECK,TOOLLIB // REQUIRES: ompt @@ -43,9 +68,33 @@ int main() // Check if libomp supports the callbacks for this test. - // CHECK-NOT: {{^}}0: Could not register callback - - // CHECK: {{^}}0: Do not initialize tool + // CHECK-NOT: {{^}}0: Could not register callback + + // ADDRSPACE: ----- START LOGGING OF TOOL REGISTRATION ----- + // ADDRSPACE-NEXT: Search for OMP tool in current address space... + + // TOOLLIB: ----- START LOGGING OF TOOL REGISTRATION ----- + // TOOLLIB-NEXT: Search for OMP tool in current address space... Failed. + // TOOLLIB-NEXT: Searching tool libraries... + // TOOLLIB-NEXT: OMP_TOOL_LIBRARIES = [[PARENTPATH]]/tool.so + // TOOLLIB-NEXT: Opening [[PARENTPATH]]/tool.so... Success. + // TOOLLIB-NEXT: Searching for ompt_start_tool in + // TOOLLIB-SAME: [[PARENTPATH]]/tool.so... + + // CHECK: 0: Do not initialize tool + + // ADDRSPACE-NEXT: Failed. + // ADDRSPACE-NEXT: No OMP_TOOL_LIBRARIES defined. + // ADDRSPACE-NEXT: ...searching tool libraries failed. + // ADDRSPACE: No OMP tool loaded. + // ADDRSPACE-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + + // TOOLLIB-NEXT: Found but not using the OMPT interface. + // TOOLLIB-NEXT: Continuing search... + // TOOLLIB-NEXT: ...searching tool libraries failed. + // TOOLLIB: No OMP tool loaded. 
+ // TOOLLIB-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + // CHECK: {{^}}0: control_tool()=-2 diff --git a/runtime/test/ompt/misc/api_calls_without_ompt.c b/runtime/test/ompt/misc/api_calls_without_ompt.c index e66aecd1d..f76446ef7 100644 --- a/runtime/test/ompt/misc/api_calls_without_ompt.c +++ b/runtime/test/ompt/misc/api_calls_without_ompt.c @@ -109,7 +109,8 @@ int main() { return 0; } -int ompt_initialize(ompt_function_lookup_t lookup, ompt_data_t *tool_data) { +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback"); ompt_get_callback = (ompt_get_callback_t)lookup("ompt_get_callback"); ompt_get_state = (ompt_get_state_t)lookup("ompt_get_state"); diff --git a/runtime/test/ompt/parallel/repeated_calls.c b/runtime/test/ompt/parallel/repeated_calls.c new file mode 100644 index 000000000..182697530 --- /dev/null +++ b/runtime/test/ompt/parallel/repeated_calls.c @@ -0,0 +1,102 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + +#define USE_PRIVATE_TOOL 1 +#include "callback.h" + +__attribute__((noinline)) +int foo(int x) { +#pragma omp parallel num_threads(2) + { +#pragma omp atomic + x++; + } + return x; +} + +__attribute__((noinline)) +int bar(int x) { +#pragma omp parallel num_threads(2) + { +#pragma omp critical + x++; + } + return x; +} + +int main() { + int y; + y = foo(y); + y = bar(y); + y = foo(y); + return 0; + + // CHECK-NOT: {{^}}0: Could not register callback + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // First call to foo + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin + // CHECK-SAME: {{.*}}codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + + // Call to bar + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + + // Second call to foo + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: {{.*}}codeptr_ra=[[RETURN_ADDRESS]] + +} + +static void on_ompt_callback_thread_begin( + ompt_thread_t thread_type, + ompt_data_t *thread_data) { + if (thread_data->ptr) + printf("%s\n", "0: thread_data initially not null"); + thread_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, ompt_thread_t_values[thread_type], + thread_type, thread_data->value); +} + +static void on_ompt_callback_parallel_begin( + ompt_data_t *encountering_task_data, + const ompt_frame_t *encountering_task_frame, ompt_data_t *parallel_data, + uint32_t requested_team_size, int flag, const void *codeptr_ra) { + if (parallel_data->ptr) + printf("0: parallel_data initially not null\n"); + parallel_data->value = ompt_get_unique_id(); + int invoker = flag & 0xF; + const char *event = (flag & ompt_parallel_team) ? "parallel" : "teams"; + const char *size = (flag & ompt_parallel_team) ? 
"team_size" : "num_teams"; + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_%s_begin: parent_task_id=%" PRIu64 + ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, " + "parallel_id=%" PRIu64 ", requested_%s=%" PRIu32 + ", codeptr_ra=%p, invoker=%d\n", + ompt_get_thread_data()->value, event, encountering_task_data->value, + encountering_task_frame->exit_frame.ptr, + encountering_task_frame->enter_frame.ptr, parallel_data->value, size, + requested_team_size, codeptr_ra, invoker); +} + +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { + ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback"); + ompt_get_unique_id = (ompt_get_unique_id_t)lookup("ompt_get_unique_id"); + ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data"); + + register_callback(ompt_callback_thread_begin); + register_callback(ompt_callback_parallel_begin); + printf("0: NULL_POINTER=%p\n", (void *)NULL); + return 1; // success +} + +void ompt_finalize(ompt_data_t *tool_data) {} + +ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, + const char *runtime_version) { + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize, + &ompt_finalize, 0}; + return &ompt_start_tool_result; +} diff --git a/runtime/test/ompt/synchronization/barrier/implicit_task_data.c b/runtime/test/ompt/synchronization/barrier/implicit_task_data.c index 71c2b1540..21dddb594 100644 --- a/runtime/test/ompt/synchronization/barrier/implicit_task_data.c +++ b/runtime/test/ompt/synchronization/barrier/implicit_task_data.c @@ -3,7 +3,7 @@ // This test checks that values stored in task_data in a barrier_begin event // are still present in the corresponding barrier_end event. -// Therefore, callback implementations different from the ones in callback.h are neccessary. +// Therefore, callback implementations different from the ones in callback.h are necessary. 
// This is a test for an issue reported in // https://github.com/OpenMPToolsInterface/LLVM-openmp/issues/39 @@ -87,6 +87,9 @@ on_ompt_callback_sync_region( if (kind == ompt_sync_region_barrier_implicit) printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -112,6 +115,9 @@ on_ompt_callback_sync_region_wait( if (kind == ompt_sync_region_barrier_implicit) printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } } @@ -125,10 +131,8 @@ do{ \ #define register_callback(name) register_callback_t(name, name##_t) -int ompt_initialize( - ompt_function_lookup_t lookup, - ompt_data_t *tool_data) -{ +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { ompt_set_callback_t ompt_set_callback; ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback"); ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id"); diff --git a/runtime/test/ompt/synchronization/lock.c b/runtime/test/ompt/synchronization/lock.c index 2a934ee85..6a4f88a11 100644 --- a/runtime/test/ompt/synchronization/lock.c +++ b/runtime/test/ompt/synchronization/lock.c @@ -5,7 +5,7 @@ int main() { - //need to use an OpenMP construct so that OMPT will be initalized + //need to use an OpenMP construct so that OMPT will be initialized #pragma omp parallel num_threads(1) print_ids(0); diff --git a/runtime/test/ompt/synchronization/masked.c b/runtime/test/ompt/synchronization/masked.c new file mode 100644 index 000000000..3eb45d959 --- /dev/null +++ b/runtime/test/ompt/synchronization/masked.c @@ -0,0 +1,38 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// GCC generates code that does not call the runtime for the master construct +// XFAIL: gcc + +#include "callback.h" +#include + +int main() { + int x = 0; +#pragma omp parallel num_threads(2) + { +#pragma omp master + { + print_fuzzy_address(1); + x++; + } + print_current_address(2); + } + + printf("%" PRIu64 ": x=%d\n", ompt_get_thread_data()->value, x); + + return 0; +} + +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_masked' + +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_masked_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_masked_end: +// CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS_END:0x[0-f]+]] +// CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS_END]] diff --git a/runtime/test/ompt/synchronization/master.c b/runtime/test/ompt/synchronization/master.c index 8cc2d46a7..2d74250fd 100644 --- a/runtime/test/ompt/synchronization/master.c +++ b/runtime/test/ompt/synchronization/master.c @@ -3,6 +3,7 @@ // GCC generates code that does not call the runtime for the master construct // XFAIL: gcc +#define USE_PRIVATE_TOOL 1 #include "callback.h" #include @@ -23,16 +24,66 @@ int main() { return 0; } +static void on_ompt_callback_master(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + switch (endpoint) { + case ompt_scope_begin: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_master_begin: codeptr_ra=%p\n", + ompt_get_thread_data()->value, codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_master_end: codeptr_ra=%p\n", + ompt_get_thread_data()->value, codeptr_ra); + break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); + } +} + +static void on_ompt_callback_thread_begin(ompt_thread_t thread_type, + ompt_data_t *thread_data) { + if (thread_data->ptr) + printf("%s\n", "0: thread_data initially not null"); + thread_data->value = ompt_get_unique_id(); + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, ompt_thread_t_values[thread_type], + thread_type, thread_data->value); +} + +int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { + ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback"); + ompt_get_unique_id = (ompt_get_unique_id_t)lookup("ompt_get_unique_id"); + ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data"); + + register_callback(ompt_callback_master); + printf("0: NULL_POINTER=%p\n", (void *)NULL); + return 1; // success +} + +void ompt_finalize(ompt_data_t *tool_data) {} + +ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, + const char *runtime_version) { + static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize, + &ompt_finalize, 0}; + return &ompt_start_tool_result; +} + // Check if libomp supports the callbacks for this test. 
// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' // CHECK: 0: NULL_POINTER=[[NULL:.*$]] // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: -// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_master_end: -// CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS_END:0x[0-f]+]] // CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS_END]] diff --git a/runtime/test/ompt/synchronization/nest_lock.c b/runtime/test/ompt/synchronization/nest_lock.c index 159048e2e..39681ae40 100644 --- a/runtime/test/ompt/synchronization/nest_lock.c +++ b/runtime/test/ompt/synchronization/nest_lock.c @@ -5,12 +5,13 @@ int main() { - //need to use an OpenMP construct so that OMPT will be initalized + //need to use an OpenMP construct so that OMPT will be initialized #pragma omp parallel num_threads(1) print_ids(0); omp_nest_lock_t nest_lock; - printf("%" PRIu64 ": &nest_lock: %lli\n", ompt_get_thread_data()->value, (ompt_wait_id_t)(uintptr_t) &nest_lock); + printf("%" PRIu64 ": &nest_lock: %" PRIu64 "\n", + ompt_get_thread_data()->value, (ompt_wait_id_t)(uintptr_t)&nest_lock); omp_init_nest_lock(&nest_lock); print_fuzzy_address(1); omp_set_nest_lock(&nest_lock); diff --git a/runtime/test/ompt/synchronization/ordered_dependences.c b/runtime/test/ompt/synchronization/ordered_dependences.c new file mode 100644 index 000000000..dc9f0824d --- /dev/null +++ b/runtime/test/ompt/synchronization/ordered_dependences.c @@ -0,0 +1,61 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 +#include "callback.h" +#include + +int main() { + int a[10][10]; + int i, j; +#pragma omp parallel num_threads(2) +#pragma omp for ordered(2) + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) { + a[i][j] = i + j + 1; + printf("%d, %d\n", i, j); +#pragma omp ordered depend(sink : i - 1, j) depend(sink : i, j - 1) + if (i > 0 && j > 0) + a[i][j] = a[i - 1][j] + a[i][j - 1] + 1; + printf("%d, %d\n", i, j); +#pragma omp ordered depend(source) + } + + return 0; +} +// CHECK: 0: NULL_POINTER=[[NULL:.*$]] + +// CHECK: {{^}}[[MASTER:[0-9]+]]: ompt_event_loop_begin: +// CHECK-SAME: parallel_id={{[0-9]+}}, parent_task_id=[[ITASK:[0-9]+]], + +// CHECK: {{^}}[[MASTER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(0, ompt_dependence_type_source), (0, +// CHECK-SAME: ompt_dependence_type_source)], ndeps=2 + +// CHECK: {{^}}[[MASTER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(0, ompt_dependence_type_sink), (0, +// CHECK-SAME: ompt_dependence_type_sink)], ndeps=2 + +// CHECK: {{^}}[[MASTER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(0, ompt_dependence_type_source), (1, +// CHECK-SAME: ompt_dependence_type_source)], ndeps=2 + +// CHECK: {{^}}[[WORKER:[0-9]+]]: ompt_event_loop_begin: +// CHECK-SAME: parallel_id={{[0-9]+}}, parent_task_id=[[ITASK:[0-9]+]], + +// CHECK: {{^}}[[WORKER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(0, ompt_dependence_type_sink), (0, +// CHECK-SAME: ompt_dependence_type_sink)], ndeps=2 + +// CHECK: {{^}}[[WORKER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(1, ompt_dependence_type_source), (0, +// CHECK-SAME: ompt_dependence_type_source)], 
ndeps=2 + +// either can be first for last iteration + +// CHECK-DAG: [[ITASK]]{{.*}}deps=[(0{{.*}}sink), (1,{{.*}}sink)] + +// CHECK-DAG: [[ITASK]]{{.*}}deps=[(1{{.*}}sink), (0,{{.*}}sink)] + +// CHECK: {{^}}[[WORKER]]: ompt_event_dependences: task_id=[[ITASK]], +// CHECK-SAME: deps=[(1, ompt_dependence_type_source), (1, +// CHECK-SAME: ompt_dependence_type_source)], ndeps=2 diff --git a/runtime/test/ompt/synchronization/reduction/empty_reduce.c b/runtime/test/ompt/synchronization/reduction/empty_reduce.c new file mode 100644 index 000000000..ca032984d --- /dev/null +++ b/runtime/test/ompt/synchronization/reduction/empty_reduce.c @@ -0,0 +1,38 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc +#include "callback.h" +#include + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0; + int i; +#pragma omp parallel num_threads(1) +#pragma omp for reduction(+ : sum) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + sum += i; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], + // CHECK-SAME: codeptr_ra= + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], + // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra= + + return 0; +} diff --git a/runtime/test/ompt/synchronization/reduction/tree_reduce.c b/runtime/test/ompt/synchronization/reduction/tree_reduce.c new file mode 100644 index 000000000..847abc109 --- /dev/null +++ b/runtime/test/ompt/synchronization/reduction/tree_reduce.c @@ -0,0 +1,52 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// RUN: %libomp-compile -DNOWAIT && %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc +#include "callback.h" +#include + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0, a = 0, b = 0; + int i; +#pragma omp parallel num_threads(5) +// for 32-bit architecture we need at least 3 variables to trigger tree +#pragma omp for reduction(+ : sum, a, b) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + a = b = sum += i; + } + + + printf("%i\n", sum); + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // order and distribution to threads not determined + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + 
// CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + + return 0; +} diff --git a/runtime/test/ompt/synchronization/taskgroup.c b/runtime/test/ompt/synchronization/taskgroup.c index 7309c0ad5..822fde027 100644 --- a/runtime/test/ompt/synchronization/taskgroup.c +++ b/runtime/test/ompt/synchronization/taskgroup.c @@ -29,7 +29,6 @@ int main() // Check if libomp supports the callbacks for this test. - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel' diff --git a/runtime/test/ompt/tasks/dependences.c b/runtime/test/ompt/tasks/dependences.c index 57b61f9b5..16732e3fe 100644 --- a/runtime/test/ompt/tasks/dependences.c +++ b/runtime/test/ompt/tasks/dependences.c @@ -3,59 +3,88 @@ // UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7 #include "callback.h" -#include +#include #include #include -int main() -{ +int main() { int x = 0; - #pragma omp parallel num_threads(2) + int condition=0; +#pragma omp parallel num_threads(2) { - #pragma omp master - { +#pragma omp master + { print_ids(0); - #pragma omp task depend(out:x) + printf("%" PRIu64 ": address of x: %p\n", ompt_get_thread_data()->value, + &x); +#pragma omp task depend(out : x) shared(condition) { x++; - delay(100); + OMPT_WAIT(condition,1); } print_fuzzy_address(1); print_ids(0); - - #pragma omp task depend(in:x) - { - x = -1; - } + +#pragma omp task depend(in : x) + { x = -1; } print_ids(0); + OMPT_SIGNAL(condition); } } x++; + return 0; +} - // Check if libomp supports the callbacks for this test. - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependences' - // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependence' - - // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_dependences' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_depende - // make sure initial data pointers are null - // CHECK-NOT: 0: new_task_data initially not null +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[FIRST_TASK:[0-f]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, task_type=ompt_task_explicit=4, has_dependences=yes - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[FIRST_TASK]], deps={{0x[0-f]+}}, ndeps=1 - // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] - // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] +// make sure initial data pointers are null +// CHECK-NOT: 0: new_task_data initially not null - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[SECOND_TASK:[0-f]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=yes - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[SECOND_TASK]], deps={{0x[0-f]+}}, ndeps=1 - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependence_pair: first_task_id=[[FIRST_TASK]], second_task_id=[[SECOND_TASK]] - // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], +// CHECK-SAME: reenter_frame=[[NULL]] - return 0; -} +// CHECK: {{^}}[[MASTER_ID]]: address of x: [[ADDRX:0x[0-f]+]] +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[FIRST_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[FIRST_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_inout)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_TASK:[0-f]+]], 
codeptr_ra={{0x[0-f]+}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[SECOND_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_in)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependence_pair: +// CHECK-SAME: first_task_id=[[FIRST_TASK]], second_task_id=[[SECOND_TASK]] + +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// CHECK-SAME: reenter_frame=[[NULL]] diff --git a/runtime/test/ompt/tasks/dependences_mutexinoutset.c b/runtime/test/ompt/tasks/dependences_mutexinoutset.c new file mode 100644 index 000000000..4dcdc0730 --- /dev/null +++ b/runtime/test/ompt/tasks/dependences_mutexinoutset.c @@ -0,0 +1,115 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +// GCC 9 introduced codegen for mutexinoutset +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 + +// clang 9 introduced codegen for mutexinoutset +// UNSUPPORTED: clang-4, clang-5, clang-6, clang-7, clang-8 +// UNSUPPORTED: icc-19 + +#include "callback.h" +#include +#include +#include + +int main() { + int x = 0; +#pragma omp parallel num_threads(2) + { +#pragma omp master + { + print_ids(0); + printf("%" PRIu64 ": address of x: %p\n", ompt_get_thread_data()->value, + &x); +#pragma omp task depend(out : x) + { + x++; + delay(100); + } + print_fuzzy_address(1); + print_ids(0); + +#pragma omp task depend(mutexinoutset : x) + { + x++; + delay(100); + } + print_fuzzy_address(2); + print_ids(0); + +#pragma omp task depend(in : x) + { x = -1; } + print_ids(0); + } + } + + x++; + + return 0; +} + +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_dependences' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_depende + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + +// make sure initial data pointers are null +// CHECK-NOT: 0: new_task_data initially not null + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: address of x: [[ADDRX:0x[0-f]+]] +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[FIRST_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[FIRST_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_inout)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[SECOND_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_mutexinoutset)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[THIRD_TASK:[0-f]+]], codeptr_ra={{0x[0-f]+}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[THIRD_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_in)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], +// CHECK-SAME: reenter_frame=[[NULL]] diff --git a/runtime/test/ompt/tasks/serialized.c b/runtime/test/ompt/tasks/serialized.c index a2c102ac5..1ce0b17a3 100644 --- a/runtime/test/ompt/tasks/serialized.c +++ b/runtime/test/ompt/tasks/serialized.c @@ -22,12 +22,15 @@ int main() { int t = (int)sin(0.1); #pragma omp task if (t) { - void *task_frame = get_frame_address(0); - if (creator_frame == task_frame) { - // Assume this code was inlined which the compiler is allowed to do. 
+ if (creator_frame == get_frame_address(0)) { + printf("Assume this code was inlined which the compiler is allowed to do:\n"); print_frame(0); + } else if (creator_frame == get_frame_address(1)) { + printf("Assume this code was called from the application:\n"); + print_frame(1); } else { // The exit frame must be our parent! + printf("Assume this code was not inlined, exit frame must be our parent:\n"); print_frame_from_outlined_fn(1); } print_ids(0); diff --git a/runtime/test/ompt/tasks/task_early_fulfill.c b/runtime/test/ompt/tasks/task_early_fulfill.c new file mode 100644 index 000000000..e1324e6af --- /dev/null +++ b/runtime/test/ompt/tasks/task_early_fulfill.c @@ -0,0 +1,69 @@ +// RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ +// RUN: %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +// Checked gcc 10.1 still does not support detach clause on task construct. +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 +// clang supports detach clause since version 11. +// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 +// icc compiler does not support detach clause. +// UNSUPPORTED: icc + +#include "callback.h" +#include + +int main() { +#pragma omp parallel +#pragma omp master + { + omp_event_handle_t event; +#pragma omp task detach(event) if (0) + { omp_fulfill_event(event); } +#pragma omp taskwait + } + return 0; +} + +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], +// CHECK-SAME: parent_task_frame.exit=[[NULL]], +// CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}, +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: requested_team_size=3, + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=0x{{[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}, +// CHECK-SAME: new_task_id=[[TASK_ID:[0-9]+]], + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[TASK_ID]], +// CHECK-SAME: second_task_id=18446744073709551615, +// CHECK-SAME: prior_task_status=ompt_task_early_fulfill=5 + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[TASK_ID]], +// CHECK-SAME: second_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_complete=1 
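The detach-clause tests above (task_early_fulfill.c) and below (task_late_fulfill.c) both exercise the OpenMP 5.0 detach(event-handle) clause together with omp_fulfill_event(). As a minimal illustrative sketch only, not part of the patch, and assuming a compiler that implements the detach clause (per the UNSUPPORTED lines in these tests: clang 11 or newer, not gcc or icc), the basic pattern they check looks roughly like this:

#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel num_threads(2)
#pragma omp master
  {
    omp_event_handle_t event;
    // The runtime creates an allow-completion event when the detached task is
    // generated and stores its handle in 'event' for the encountering thread.
#pragma omp task detach(event)
    { printf("detached task body\n"); }
    // A detached task only completes once its body has finished AND the event
    // has been fulfilled; fulfilling before the body ends is the "early" case,
    // fulfilling afterwards is the "late" case.
    omp_fulfill_event(event);
#pragma omp taskwait // returns only after body completion and fulfillment
  }
  return 0;
}

The essential difference between the two new tests is whether the event is fulfilled before or after the task body has finished, which is what the prior_task_status values ompt_task_early_fulfill=5 and ompt_task_late_fulfill=6 in their respective CHECK lines distinguish.
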
diff --git a/runtime/test/ompt/tasks/task_if0-depend.c b/runtime/test/ompt/tasks/task_if0-depend.c new file mode 100644 index 000000000..2ecbf02fa --- /dev/null +++ b/runtime/test/ompt/tasks/task_if0-depend.c @@ -0,0 +1,75 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +#include "callback.h" +#include + +int main() { + int x = 0; +#pragma omp parallel num_threads(2) + { +#pragma omp master + { + print_ids(0); + printf("%" PRIu64 ": address of x: %p\n", ompt_get_thread_data()->value, + &x); +#pragma omp task depend(out : x) + { x++; } + print_fuzzy_address(1); +#pragma omp task if (0) depend(in : x) + {} + print_fuzzy_address(2); + } + } + + return 0; +} + +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_dependences' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_depende + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + +// make sure initial data pointers are null +// CHECK-NOT: 0: new_task_data initially not null + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: address of x: [[ADDRX:0x[0-f]+]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[FIRST_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[FIRST_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_inout)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred| +// CHECK-SAME: ompt_task_mergeable=1207959556, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[SECOND_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_in)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[SECOND_TASK]] + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] diff --git a/runtime/test/ompt/tasks/task_late_fulfill.c b/runtime/test/ompt/tasks/task_late_fulfill.c new file mode 100644 index 000000000..13a2a54a6 --- /dev/null +++ b/runtime/test/ompt/tasks/task_late_fulfill.c @@ -0,0 +1,84 @@ +// RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ +// RUN: %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +// Checked gcc 10.1 still does not support detach clause on task construct. +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 +// clang supports detach clause since version 11. 
+// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 +// icc compiler does not support detach clause. +// UNSUPPORTED: icc + +#include "callback.h" +#include + +int main() { +#pragma omp parallel +#pragma omp master + { + omp_event_handle_t event; + omp_event_handle_t *f_event; +#pragma omp task detach(event) depend(out : f_event) shared(f_event) if (0) + { + printf("task 1\n"); + f_event = &event; + } +#pragma omp task depend(in : f_event) + { printf("task 2\n"); } + printf("calling omp_fulfill_event\n"); + omp_fulfill_event(*f_event); +#pragma omp taskwait + } + return 0; +} + +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released' + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], +// CHECK-SAME: parent_task_frame.exit=[[NULL]], +// CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}, +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: requested_team_size=3, + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + +// The following is to match the taskwait task created in __kmpc_omp_wait_deps +// this should go away, once codegen for "detached if(0)" is fixed + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=0x{{[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}, +// CHECK-SAME: new_task_id=[[TASK_ID:[0-9]+]], + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[TASK_ID]], +// CHECK-SAME: second_task_id=[[IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_detach=4 + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[TASK_ID]], +// CHECK-SAME: second_task_id=18446744073709551615, +// CHECK-SAME: prior_task_status=ompt_task_late_fulfill=6 diff --git a/runtime/test/ompt/tasks/task_memory.c b/runtime/test/ompt/tasks/task_memory.c index a48cef22b..fabb3c5eb 100644 --- a/runtime/test/ompt/tasks/task_memory.c +++ b/runtime/test/ompt/tasks/task_memory.c @@ -41,6 +41,9 @@ static void on_ompt_callback_implicit_task(ompt_scope_endpoint_t endpoint, ", memory_addr=%p, memory_size=%lu, result=%d \n", task_data->value, addr, size, result); break; + case ompt_scope_beginend: + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } 
} @@ -96,7 +99,7 @@ ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, // CHECK-SAME: memory_addr=[[NULL]], memory_size=0, result=0 // CHECK: ompt_event_task_create: task_id=[[TASK_ID_0:[0-9]+]] -// CHECK: ompt_event_task_create: task_id=[[TASK_ID_1:[0-9]+]] +// CHECK-DAG: ompt_event_task_create: task_id=[[TASK_ID_1:[0-9]+]] // Expects non-zero address, size, and result // CHECK-DAG: ompt_event_task_schedule: task_id=[[TASK_ID_0]], diff --git a/runtime/test/ompt/tasks/taskwait-depend.c b/runtime/test/ompt/tasks/taskwait-depend.c new file mode 100644 index 000000000..e62ad70f2 --- /dev/null +++ b/runtime/test/ompt/tasks/taskwait-depend.c @@ -0,0 +1,82 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt + +// taskwait with depend clause was introduced with gcc-9 +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 + +// clang does not yet support taskwait with depend clause +// clang-12 introduced parsing, but no codegen +// update expected result when codegen in clang was added +// XFAIL: clang + +#include "callback.h" +#include + +int main() { + int x = 0; +#pragma omp parallel num_threads(2) + { +#pragma omp master + { + print_ids(0); + printf("%" PRIu64 ": address of x: %p\n", ompt_get_thread_data()->value, + &x); +#pragma omp task depend(out : x) + { x++; } + print_fuzzy_address(1); + #pragma omp taskwait depend(in: x) + print_fuzzy_address(2); + } + } + + return 0; +} + +// Check if libomp supports the callbacks for this test. +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_dependences' +// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_depende + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + +// make sure initial data pointers are null +// CHECK-NOT: 0: new_task_data initially not null + +// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]] + +// CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], +// CHECK-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], +// CHECK-SAME: reenter_frame=[[NULL]] + +// CHECK: {{^}}[[MASTER_ID]]: address of x: [[ADDRX:0x[0-f]+]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[FIRST_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit=4, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[FIRST_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_inout)], ndeps=1 + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: +// CHECK-SAME: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_TASK:[0-f]+]], +// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, +// CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred| +// CHECK-SAME: ompt_task_mergeable=1207959556, has_dependences=yes + +// CHECK: {{^}}[[MASTER_ID]]: ompt_event_dependences: +// CHECK-SAME: task_id=[[SECOND_TASK]], deps=[([[ADDRX]], +// CHECK-SAME: ompt_dependence_type_in)], ndeps=1 + 
+// CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[SECOND_TASK]] + +// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] diff --git a/runtime/test/ompt/teams/parallel_team.c b/runtime/test/ompt/teams/parallel_team.c index 15d9b6c24..6d8ec7e46 100644 --- a/runtime/test/ompt/teams/parallel_team.c +++ b/runtime/test/ompt/teams/parallel_team.c @@ -1,6 +1,6 @@ // RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s -// REQUIRES: ompt -// UNSUPPORTED: gcc +// REQUIRES: ompt, multicpu +// UNSUPPORTED: gcc, icc-19 #include "callback.h" int main() { diff --git a/runtime/test/ompt/teams/serial_teams.c b/runtime/test/ompt/teams/serial_teams.c index 64d0c8958..439aff29b 100644 --- a/runtime/test/ompt/teams/serial_teams.c +++ b/runtime/test/ompt/teams/serial_teams.c @@ -1,6 +1,6 @@ // RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s -// REQUIRES: ompt -// UNSUPPORTED: gcc +// REQUIRES: ompt, multicpu +// UNSUPPORTED: gcc, icc-19 #include "callback.h" int main() { diff --git a/runtime/test/ompt/teams/serialized.c b/runtime/test/ompt/teams/serialized.c index 4edd422bf..40bc14dfe 100644 --- a/runtime/test/ompt/teams/serialized.c +++ b/runtime/test/ompt/teams/serialized.c @@ -1,6 +1,6 @@ // RUN: %libomp-compile-and-run | FileCheck %s // REQUIRES: ompt -// UNSUPPORTED: gcc +// UNSUPPORTED: gcc, icc-19 #include "callback.h" int main() { diff --git a/runtime/test/ompt/teams/team.c b/runtime/test/ompt/teams/team.c index 3aa85ea62..2ce5101c4 100644 --- a/runtime/test/ompt/teams/team.c +++ b/runtime/test/ompt/teams/team.c @@ -1,6 +1,6 @@ // RUN: %libomp-compile-and-run | FileCheck %s // REQUIRES: ompt -// UNSUPPORTED: gcc +// UNSUPPORTED: gcc, icc-19 #include "callback.h" int main() { diff --git a/runtime/test/parallel/omp_parallel_copyin.c b/runtime/test/parallel/omp_parallel_copyin.c index 600f9b72c..a9ffb09fe 100644 --- a/runtime/test/parallel/omp_parallel_copyin.c +++ b/runtime/test/parallel/omp_parallel_copyin.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include #include "omp_testsuite.h" diff --git a/runtime/test/tasking/bug_nested_proxy_task.c b/runtime/test/tasking/bug_nested_proxy_task.c index f70e9044a..90b4834e5 100644 --- a/runtime/test/tasking/bug_nested_proxy_task.c +++ b/runtime/test/tasking/bug_nested_proxy_task.c @@ -1,6 +1,11 @@ // RUN: %libomp-compile-and-run // The runtime currently does not get dependency information from GCC. // UNSUPPORTED: gcc +// REQUIRES: !abt + +// Very flaky on openmp-clang-x86_64-linux-debian. +// https://bugs.llvm.org/show_bug.cgi?id=45397 +// UNSUPPORTED: linux #include #include diff --git a/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/runtime/test/tasking/bug_proxy_task_dep_waiting.c index c07f399d2..5cc688008 100644 --- a/runtime/test/tasking/bug_proxy_task_dep_waiting.c +++ b/runtime/test/tasking/bug_proxy_task_dep_waiting.c @@ -1,6 +1,11 @@ // RUN: %libomp-compile-and-run // The runtime currently does not get dependency information from GCC. -// UNSUPPORTED: gcc +// UNSUPPORTED: gcc, icc-16 +// REQUIRES: !abt + +// Very flaky on openmp-clang-x86_64-linux-debian. 
+// https://bugs.llvm.org/show_bug.cgi?id=45397 +// UNSUPPORTED: linux #include #include diff --git a/runtime/test/tasking/kmp_detach_tasks_t1.c b/runtime/test/tasking/kmp_detach_tasks_t1.c index f1763ec16..c0d3b45da 100644 --- a/runtime/test/tasking/kmp_detach_tasks_t1.c +++ b/runtime/test/tasking/kmp_detach_tasks_t1.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile && env OMP_NUM_THREADS='3' %libomp-run // RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run +// REQUIRES: !abt #include #include diff --git a/runtime/test/tasking/kmp_detach_tasks_t2.c b/runtime/test/tasking/kmp_detach_tasks_t2.c index 66fcb8fbb..f63f5eb49 100644 --- a/runtime/test/tasking/kmp_detach_tasks_t2.c +++ b/runtime/test/tasking/kmp_detach_tasks_t2.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile && env OMP_NUM_THREADS='3' %libomp-run // RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run +// REQUIRES: !abt #include #include diff --git a/runtime/test/tasking/kmp_detach_tasks_t3.c b/runtime/test/tasking/kmp_detach_tasks_t3.c index e14bab60b..548584dfd 100644 --- a/runtime/test/tasking/kmp_detach_tasks_t3.c +++ b/runtime/test/tasking/kmp_detach_tasks_t3.c @@ -2,6 +2,7 @@ // RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run // The runtime currently does not get dependency information from GCC. // UNSUPPORTED: gcc +// REQUIRES: !abt #include #include diff --git a/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp b/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp index f2dea9d7b..9812d602d 100644 --- a/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp +++ b/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp @@ -39,7 +39,7 @@ typedef struct red_input { void *reduce_orig; /**< original reduction item used for initialization */ size_t reduce_size; /**< size of data item in bytes */ // three compiler-generated routines (init, fini are optional): - void *reduce_init; /**< data initialization routine (single paramemter) */ + void *reduce_init; /**< data initialization routine (single parameter) */ void *reduce_fini; /**< data finalization routine */ void *reduce_comb; /**< data combiner routine */ unsigned flags; /**< flags for additional info from compiler */ diff --git a/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp b/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp index 2526d4e9d..94e9bbca5 100644 --- a/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp +++ b/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp @@ -36,7 +36,7 @@ typedef struct red_input { void *reduce_shar; /**< shared between tasks item to reduce into */ size_t reduce_size; /**< size of data item in bytes */ // three compiler-generated routines (init, fini are optional): - void *reduce_init; /**< data initialization routine (single paramemter) */ + void *reduce_init; /**< data initialization routine (single parameter) */ void *reduce_fini; /**< data finalization routine */ void *reduce_comb; /**< data combiner routine */ unsigned flags; /**< flags for additional info from compiler */ diff --git a/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp b/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp index e66cda91a..29d86e30a 100644 --- a/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp +++ b/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp @@ -39,7 +39,7 @@ typedef struct red_input { void *reduce_orig; /**< original reduction item used for initialization */ size_t reduce_size; /**< size of data item in bytes */ // three compiler-generated routines 
(init, fini are optional): - void *reduce_init; /**< data initialization routine (single paramemter) */ + void *reduce_init; /**< data initialization routine (single parameter) */ void *reduce_fini; /**< data finalization routine */ void *reduce_comb; /**< data combiner routine */ unsigned flags; /**< flags for additional info from compiler */ diff --git a/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp b/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp index 97d5cb5d9..3e0ce26f3 100644 --- a/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp +++ b/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp @@ -36,7 +36,7 @@ typedef struct red_input { void *reduce_shar; /**< shared between tasks item to reduce into */ size_t reduce_size; /**< size of data item in bytes */ // three compiler-generated routines (init, fini are optional): - void *reduce_init; /**< data initialization routine (single paramemter) */ + void *reduce_init; /**< data initialization routine (single parameter) */ void *reduce_fini; /**< data finalization routine */ void *reduce_comb; /**< data combiner routine */ unsigned flags; /**< flags for additional info from compiler */ diff --git a/runtime/test/tasking/kmp_taskloop_5.c b/runtime/test/tasking/kmp_taskloop_5.c new file mode 100644 index 000000000..aca0e7565 --- /dev/null +++ b/runtime/test/tasking/kmp_taskloop_5.c @@ -0,0 +1,167 @@ +// RUN: %libomp-compile-and-run +// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run + +#include +#include +#include "omp_my_sleep.h" + +#define N 4 +#define ST 3 +#define UB 118 +#define LB 0 + +// globals +int counter; +int task_count; + +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + +typedef struct shar { + int *pcounter; + int *pj; + int *ptask_count; +} *pshareds; + +typedef struct task { + pshareds shareds; + int(* routine)(int,struct task*); + int part_id; + unsigned long long lb; // library always uses ULONG + unsigned long long ub; + int st; + int last; + int i; + int j; + int th; +} *ptask, kmp_task_t; + +typedef int(* task_entry_t)( int, ptask ); + +void +__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv) +{ +// setup lastprivate flag + task_dst->last = lastpriv; +// could be constructor calls here... 
+} + +// OpenMP RTL interfaces +typedef unsigned long long kmp_uint64; +typedef long long kmp_int64; + +#ifdef __cplusplus +extern "C" { +#endif +void +__kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_int64 grainsize, int modifier, + void *task_dup); +ptask +__kmpc_omp_task_alloc(ident_t *loc, int gtid, int flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + task_entry_t task_entry); +void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs); +int __kmpc_global_thread_num(void *id_ref); +#ifdef __cplusplus +} +#endif + +// User's code +int task_entry(int gtid, ptask task) +{ + pshareds pshar = task->shareds; + __kmpc_atomic_fixed4_add(NULL, gtid, pshar->ptask_count, 1); + + for (task->i = task->lb; task->i <= (int)task->ub; task->i += task->st) { + task->th = omp_get_thread_num(); + __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1); + task->j = task->i; + } + my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks + if (task->last) { + *(pshar->pj) = task->j; // lastprivate + } + return 0; +} + +void task_loop(int sched_type, int sched_val, int modifier) +{ + int i, j, gtid = __kmpc_global_thread_num(NULL); + ptask task; + pshareds psh; + omp_set_dynamic(0); + counter = 0; + task_count = 0; + #pragma omp parallel num_threads(N) + { + #pragma omp master + { + int gtid = __kmpc_global_thread_num(NULL); + task = __kmpc_omp_task_alloc(NULL, gtid, 1, sizeof(struct task), + sizeof(struct shar), &task_entry); + psh = task->shareds; + psh->pcounter = &counter; + psh->ptask_count = &task_count; + psh->pj = &j; + task->lb = LB; + task->ub = UB; + task->st = ST; + + __kmpc_taskloop_5( + NULL, // location + gtid, // gtid + task, // task structure + 1, // if clause value + &task->lb, // lower bound + &task->ub, // upper bound + ST, // loop increment + 0, // 1 if nogroup specified + sched_type, // schedule type: 0-none, 1-grainsize, 2-num_tasks + sched_val, // schedule value (ignored for type 0) + modifier, // strict modifier + (void*)&__task_dup_entry // tasks duplication routine + ); + } // end master + } // end parallel +// check results + int tc; + if (ST == 1) { // most common case + tc = UB - LB + 1; + } else if (ST < 0) { + tc = (LB - UB) / (-ST) + 1; + } else { // ST > 0 + tc = (UB - LB) / ST + 1; + } + int count; + if (sched_type == 1) { + count = (sched_val > tc) ? 1 : (tc + sched_val - 1) / sched_val; + } else { + count = (sched_val > tc) ? 
tc : sched_val; + } + if (j != LB + (tc - 1) * ST) { + printf("Error in lastprivate, %d != %d\n", j, LB + (tc - 1) * ST); + exit(1); + } + if (counter != tc) { + printf("Error, counter %d != %d\n", counter, tc); + exit(1); + } + if (task_count != count) { + printf("Error, task count %d != %d\n", task_count, count); + exit(1); + } +} + +int main(int argc, char *argv[]) { + task_loop(1, 6, 1); // create 7 tasks + task_loop(2, 6, 1); // create 6 tasks + task_loop(1, 50, 1); // create 1 task + task_loop(2, 50, 1); // create 40 tasks + + printf("Test passed\n"); + return 0; +} diff --git a/runtime/test/tasking/omp50_task_depend_mtx.c b/runtime/test/tasking/omp50_task_depend_mtx.c index 79c270e94..0e9bb53b9 100644 --- a/runtime/test/tasking/omp50_task_depend_mtx.c +++ b/runtime/test/tasking/omp50_task_depend_mtx.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt // Tests OMP 5.0 task dependences "mutexinoutset", emulates compiler codegen // Mutually exclusive tasks get same input dependency info array diff --git a/runtime/test/tasking/omp50_task_depend_mtx2.c b/runtime/test/tasking/omp50_task_depend_mtx2.c index ec8a7d1ca..2478401c0 100644 --- a/runtime/test/tasking/omp50_task_depend_mtx2.c +++ b/runtime/test/tasking/omp50_task_depend_mtx2.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt // Tests OMP 5.0 task dependences "mutexinoutset", emulates compiler codegen // Mutually exclusive tasks get input dependency info array sorted differently diff --git a/runtime/test/tasking/omp50_task_depend_mtx3.c b/runtime/test/tasking/omp50_task_depend_mtx3.c new file mode 100644 index 000000000..aacfb420a --- /dev/null +++ b/runtime/test/tasking/omp50_task_depend_mtx3.c @@ -0,0 +1,103 @@ +// RUN: %libomp-compile-and-run +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 +// UNSUPPORTED: clang-3, clang-4, clang-5, clang-6, clang-7, clang-8 +// TODO: update expected result when icc supports mutexinoutset +// XFAIL: icc +// REQUIRES: !abt + +// Tests OMP 5.0 task dependences "mutexinoutset", emulates compiler codegen +// Mutually exclusive tasks get same input dependency info array +// +// Task tree created: +// task0 task1 +// \ / \ +// task2 task5 +// / \ +// task3 task4 +// / \ +// task6 <-->task7 (these two are mutually exclusive) +// \ / +// task8 +// +#include +#include +#include "omp_my_sleep.h" + +static int checker = 0; // to check if two tasks run simultaneously +static int err = 0; +#ifndef DELAY +#define DELAY 0.1 +#endif + +int mutex_task(int task_id) { + int th = omp_get_thread_num(); + #pragma omp atomic + ++checker; + printf("task %d, th %d\n", task_id, th); + if (checker != 1) { + err++; + printf("Error1, checker %d != 1\n", checker); + } + my_sleep(DELAY); + if (checker != 1) { + err++; + printf("Error2, checker %d != 1\n", checker); + } + #pragma omp atomic + --checker; + return 0; +} + +int main() +{ + int i1,i2,i3,i4; + omp_set_num_threads(2); + #pragma omp parallel + { + #pragma omp single nowait + { + int t = omp_get_thread_num(); + #pragma omp task depend(in: i1, i2) + { int th = omp_get_thread_num(); + printf("task 0_%d, th %d\n", t, th); + my_sleep(DELAY); } + #pragma omp task depend(in: i1, i3) + { int th = omp_get_thread_num(); + printf("task 1_%d, th %d\n", t, th); + my_sleep(DELAY); } + #pragma omp task depend(in: i2) depend(out: i1) + { int th = omp_get_thread_num(); + printf("task 2_%d, th %d\n", t, th); + my_sleep(DELAY); } + #pragma omp task depend(in: i1) + { int th = omp_get_thread_num(); + printf("task 3_%d, th %d\n", t, th); + 
my_sleep(DELAY); } + #pragma omp task depend(out: i2) + { int th = omp_get_thread_num(); + printf("task 4_%d, th %d\n", t, th); + my_sleep(DELAY+0.1); } // wait a bit longer than task 3 + #pragma omp task depend(out: i3) + { int th = omp_get_thread_num(); + printf("task 5_%d, th %d\n", t, th); + my_sleep(DELAY); } + + #pragma omp task depend(mutexinoutset: i1, i4) + { mutex_task(6); } + #pragma omp task depend(mutexinoutset: i1, i4) + { mutex_task(7); } + + #pragma omp task depend(in: i1) + { int th = omp_get_thread_num(); + printf("task 8_%d, th %d\n", t, th); + my_sleep(DELAY); } + } // single + } // parallel + if (err == 0) { + printf("passed\n"); + return 0; + } else { + printf("failed\n"); + return 1; + } +} diff --git a/runtime/test/tasking/omp50_taskwait_depend.c b/runtime/test/tasking/omp50_taskwait_depend.c new file mode 100644 index 000000000..5a6b42775 --- /dev/null +++ b/runtime/test/tasking/omp50_taskwait_depend.c @@ -0,0 +1,109 @@ +// RUN: %libomp-compile-and-run +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 +// clang does not yet support taskwait with depend clause +// clang-12 introduced parsing, but no codegen +// TODO: update expected result when codegen in clang is added +// icc does not yet support taskwait with depend clause +// TODO: update expected result when support for icc is added +// XFAIL: clang, icc +// REQUIRES: !abt + +#include +#include +#include +#include "omp_my_sleep.h" + +int a = 0, b = 0; +int task_grabbed = 0, task_can_proceed = 0; +int task2_grabbed = 0, task2_can_proceed = 0; + +static void wait_on_flag(int *flag) { + int flag_value; + int timelimit = 30; + int secs = 0; + do { + #pragma omp atomic read + flag_value = *flag; + my_sleep(1.0); + secs++; + if (secs == timelimit) { + fprintf(stderr, "error: timeout in wait_on_flag()\n"); + exit(EXIT_FAILURE); + } + } while (flag_value == 0); +} + +static void signal_flag(int *flag) { + #pragma omp atomic + (*flag)++; +} + +int main(int argc, char** argv) { + + // Ensure two threads are running + int num_threads = omp_get_max_threads(); + if (num_threads < 2) + omp_set_num_threads(2); + + #pragma omp parallel shared(a) + { + int a_value; + // Let us be extra safe here + if (omp_get_num_threads() > 1) { + #pragma omp single nowait + { + // Schedule independent child task that + // waits to be flagged after sebsequent taskwait depend() + #pragma omp task + { + signal_flag(&task_grabbed); + wait_on_flag(&task_can_proceed); + } + // Let another worker thread grab the task to execute + wait_on_flag(&task_grabbed); + // This should be ignored since the task above has + // no dependency information + #pragma omp taskwait depend(inout: a) + // Signal the independent task to proceed + signal_flag(&task_can_proceed); + + // Schedule child task with dependencies that taskwait does + // not care about + #pragma omp task depend(inout: b) + { + signal_flag(&task2_grabbed); + wait_on_flag(&task2_can_proceed); + #pragma omp atomic + b++; + } + // Let another worker thread grab the task to execute + wait_on_flag(&task2_grabbed); + // This should be ignored since the task above has + // dependency information on b instead of a + #pragma omp taskwait depend(inout: a) + // Signal the task to proceed + signal_flag(&task2_can_proceed); + + // Generate one child task for taskwait + #pragma omp task shared(a) depend(inout: a) + { + my_sleep(1.0); + #pragma omp atomic + a++; + } + #pragma omp taskwait depend(inout: a) + + #pragma omp atomic read + a_value = a; + + if (a_value != 1) { + fprintf(stderr, "error: dependent task 
was not executed before " + "taskwait finished\n"); + exit(EXIT_FAILURE); + } + } // #pragma omp single + } // if (num_threads > 1) + } // #pragma omp parallel + + return EXIT_SUCCESS; +} diff --git a/runtime/test/tasking/omp_detach_taskwait.c b/runtime/test/tasking/omp_detach_taskwait.c new file mode 100644 index 000000000..2ecbd61bb --- /dev/null +++ b/runtime/test/tasking/omp_detach_taskwait.c @@ -0,0 +1,27 @@ +// RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' %libomp-run +// RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='1' %libomp-run + +// Checked gcc 10.1 still does not support detach clause on task construct. +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 +// clang supports detach clause since version 11. +// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 +// icc compiler does not support detach clause. +// UNSUPPORTED: icc +// REQUIRES: !abt + +#include + +int main() +{ + #pragma omp parallel + #pragma omp master + { + omp_event_handle_t event; + #pragma omp task detach(event) + { + omp_fulfill_event(event); + } + #pragma omp taskwait + } + return 0; +} diff --git a/runtime/test/tasking/omp_fill_taskqueue.c b/runtime/test/tasking/omp_fill_taskqueue.c index 17e1de380..81313e41c 100644 --- a/runtime/test/tasking/omp_fill_taskqueue.c +++ b/runtime/test/tasking/omp_fill_taskqueue.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile && env KMP_ENABLE_TASK_THROTTLING=0 %libomp-run // RUN: %libomp-compile && env KMP_ENABLE_TASK_THROTTLING=1 %libomp-run +// REQUIRES: !abt && !icc #include #include @@ -47,10 +48,14 @@ int main() // all tasks, and detect the test failure if it has not been done yet. if (failed < 0) failed = throttling ? enqueued == NUM_TASKS : enqueued < NUM_TASKS; +#pragma omp atomic write block = 0; } - while (block) - ; + int wait = 0; + do { +#pragma omp atomic read + wait = block; + } while (wait); } } block = 0; diff --git a/runtime/test/tasking/omp_task_depend.c b/runtime/test/tasking/omp_task_depend.c new file mode 100644 index 000000000..b1ac9f2b8 --- /dev/null +++ b/runtime/test/tasking/omp_task_depend.c @@ -0,0 +1,91 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +int calc_seq(int n) { + int i, j, ret; + int *buffer = (int *)malloc(sizeof(int) * n * n); + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) { + if (i == 0 && j == 0) { + buffer[i * n + j] = 1; + } else if (i == 0) { + buffer[i * n + j] = buffer[i * n + (j - 1)]; + } else if (j == 0) { + buffer[i * n + j] = buffer[(i - 1) * n + j]; + } else { + buffer[i * n + j] = buffer[(i - 1) * n + j] + buffer[i * n + (j - 1)]; + } + } + } + ret = buffer[(n - 1) * n + (n - 1)]; + free(buffer); + return ret; +} + +int main() +{ + int r; + int n = 5; + int num_failed=0; + + for(r = 0; r < REPETITIONS; r++) { + int seq_val, task_val; + #pragma omp parallel shared(task_val) firstprivate(n) + #pragma omp master + { + int i, j; + int *A_buf = (int *)malloc(sizeof(int) * n * n); + int **A = (int **)malloc(sizeof(int *) * n); + for(i = 0; i < n; i++) { + A[i] = A_buf + (i * n); + for(j = 0; j < n; j++) { + // Assign random values. + A[i][j] = i * n + j; + } + } + // A[i][j] is the root task. 
+ for(i = 0; i < n; i++) { + for(j = 0; j < n; j++) { + if (i == 0 && j == 0) { + #pragma omp task depend(out:A[i][j]) firstprivate(A, i, j) + { + A[i][j] = 1; + } + } else if (i == 0) { + #pragma omp task depend(in:A[i][j - 1]) depend(out:A[i][j])\ + firstprivate(A, i, j) + { + A[i][j] = A[i][j - 1]; + } + } else if (j == 0) { + #pragma omp task depend(in:A[i - 1][j]) depend(out:A[i][j])\ + firstprivate(A, i, j) + { + A[i][j] = A[i - 1][j]; + } + } else { + #pragma omp task depend(in:A[i - 1][j], A[i][j - 1])\ + depend(out:A[i][j]) + { + A[i][j] = A[i - 1][j] + A[i][j - 1]; + } + } + } + } + #pragma omp taskwait + task_val = A[n - 1][n - 1]; + free(A); + free(A_buf); + } + + seq_val = calc_seq(n); + if(seq_val != task_val) { + printf("[%d] Failed: route(%d) = %d (ANS = %d)\n", r, n, task_val, + seq_val); + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/tasking/omp_task_depend_resize_hashmap.c b/runtime/test/tasking/omp_task_depend_resize_hashmap.c index 03f12b06d..58cca562f 100644 --- a/runtime/test/tasking/omp_task_depend_resize_hashmap.c +++ b/runtime/test/tasking/omp_task_depend_resize_hashmap.c @@ -1,5 +1,13 @@ // RUN: %libomp-compile && env KMP_ENABLE_TASK_THROTTLING=0 %libomp-run +// This test is known to be fragile on NetBSD kernel at the moment, +// https://bugs.llvm.org/show_bug.cgi?id=42020. +// UNSUPPORTED: netbsd + +// Very flaky on openmp-clang-x86_64-linux-debian. +// https://bugs.llvm.org/show_bug.cgi?id=45397 +// UNSUPPORTED: linux + #include #include #include diff --git a/runtime/test/tasking/omp_task_nest_tied.c b/runtime/test/tasking/omp_task_nest_tied.c new file mode 100644 index 000000000..3cdac7d3e --- /dev/null +++ b/runtime/test/tasking/omp_task_nest_tied.c @@ -0,0 +1,57 @@ +// RUN: %libomp-compile-and-run +#include +#include "omp_testsuite.h" + +int fib(int n) { + int a, b; + if (n < 2) { + return n; + } else { + if(n < 4) { + return fib(n - 1) + fib(n - 2); + } else { + #pragma omp task shared(a) + { + a = fib(n - 1); + } + #pragma omp task shared(b) + { + b = fib(n - 2); + } + #pragma omp taskwait + return a + b; + } + } +} + +int fib_seq(int n) { + int a, b; + if (n < 2) { + return n; + } else { + a = fib_seq(n - 1); + b = fib_seq(n - 2); + return a + b; + } +} + +int main() { + int i; + int n = 20; + int num_failed = 0; + + for(i = 0; i < REPETITIONS; i++) { + int task_val = 0; + int seq_val = fib_seq(n); + #pragma omp parallel shared(task_val) firstprivate(n) + #pragma omp master + { + task_val = fib(n); + } + if(seq_val != task_val) { + printf("[%d] Failed: fib(%d) = %d (ANS = %d)\n", i, n, task_val, seq_val); + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/tasking/omp_task_nest_untied.c b/runtime/test/tasking/omp_task_nest_untied.c new file mode 100644 index 000000000..5feeafd68 --- /dev/null +++ b/runtime/test/tasking/omp_task_nest_untied.c @@ -0,0 +1,58 @@ +// RUN: %libomp-compile-and-run +#include +#include "omp_testsuite.h" + +int fib(int n) { + int a, b; + if (n < 2) { + return n; + } else { + if(n < 4) { + return fib(n - 1) + fib(n - 2); + } else { + #pragma omp task shared(a) untied + { + a = fib(n - 1); + } + #pragma omp task shared(b) untied + { + b = fib(n - 2); + } + #pragma omp taskwait + return a + b; + } + } +} + +int fib_seq(int n) { + int a, b; + if (n < 2) { + return n; + } else { + a = fib_seq(n - 1); + b = fib_seq(n - 2); + return a + b; + } +} + +int main() { + int i; + int n = 20; + int num_failed = 0; + + for(i = 0; i < REPETITIONS; i++) { + int task_val = 0; + int seq_val = 
fib_seq(n); + #pragma omp parallel shared(task_val) firstprivate(n) + #pragma omp master + { + task_val = fib(n); + } + if(seq_val != task_val) { + printf("[%d] Failed: fib(%d) = %d (ANS = %d)\n", i, n, task_val, seq_val); + num_failed++; + } + } + return num_failed; +} + diff --git a/runtime/test/tasking/omp_task_red_taskloop.c b/runtime/test/tasking/omp_task_red_taskloop.c new file mode 100644 index 000000000..0a595e4c6 --- /dev/null +++ b/runtime/test/tasking/omp_task_red_taskloop.c @@ -0,0 +1,69 @@ +// RUN: %libomp-compile-and-run + +// Parsing error until gcc8: +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 + +// Parsing error until clang11: +// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 + +// Missing GOMP_taskgroup_reduction_(un)register in LLVM/OpenMP +// Should be removed once the functions are implemented +// XFAIL: gcc-9, gcc-10 +// UNSUPPORTED: icc-19 +// REQUIRES: !abt + +#include +#include + +int r; + +int work(int k, int l) +{ + return k + l + 1; +} +void bar(int i) { + #pragma omp taskgroup task_reduction(+:r) + { int th_gen = omp_get_thread_num(); + #pragma omp task in_reduction(+:r) firstprivate(i, th_gen) + { + r += work(i, 0); +printf("executing task (%d, 0), th %d (gen by th %d)\n", i, omp_get_thread_num(), th_gen); + } + #pragma omp task in_reduction(+:r) firstprivate(i, th_gen) + { + r += work(i, 1); +printf("executing task (%d, 1), th %d (gen by th %d)\n", i, omp_get_thread_num(), th_gen); + } + } +} +int foo() { + int i; + int th_gen = omp_get_thread_num(); + #pragma omp taskgroup task_reduction(+:r) + { + bar(0); + } +printf("th %d passed bar0\n", th_gen); + #pragma omp taskloop reduction(+:r) firstprivate(th_gen) + for (i = 1; i < 4; ++i) { + bar(i); +printf("th %d (gen by th %d) passed bar%d in taskloop\n", omp_get_thread_num(), th_gen, i); + #pragma omp task in_reduction(+:r) + r += i; + } + return 0; +} +// res = 2*((1+2)+(2+3)+(3+4)+(4+5)+1+2+3) = 60 +#define res 60 +int main() +{ + r = 0; + #pragma omp parallel num_threads(2) + foo(); + if (r == res) { + return 0; + } else { + printf("error r = %d (!= %d)\n", r, res); + return 1; + } +} diff --git a/runtime/test/tasking/omp_taskloop_grainsize.c b/runtime/test/tasking/omp_taskloop_grainsize.c index 0833073ef..f1812bd7d 100644 --- a/runtime/test/tasking/omp_taskloop_grainsize.c +++ b/runtime/test/tasking/omp_taskloop_grainsize.c @@ -8,7 +8,7 @@ /* * Test for taskloop - * Method: caculate how many times the iteration space is dispatched + * Method: calculate how many times the iteration space is dispatched * and judge if each dispatch has the requested grainsize * It is possible for two adjacent chunks are executed by the same thread */ diff --git a/runtime/test/tasking/omp_taskloop_num_tasks.c b/runtime/test/tasking/omp_taskloop_num_tasks.c index 75cc337aa..1999cb57d 100644 --- a/runtime/test/tasking/omp_taskloop_num_tasks.c +++ b/runtime/test/tasking/omp_taskloop_num_tasks.c @@ -12,7 +12,7 @@ /* * Test for taskloop - * Method: caculate how many times the iteration space is dispatched + * Method: calculate how many times the iteration space is dispatched * and judge if each dispatch has the requested grainsize * It is possible for two adjacent chunks are executed by the same thread */ diff --git a/runtime/test/tasking/omp_taskloop_taskwait.c b/runtime/test/tasking/omp_taskloop_taskwait.c new file mode 100644 index 000000000..6cb226461 --- /dev/null +++ b/runtime/test/tasking/omp_taskloop_taskwait.c @@ -0,0 +1,30 @@ +// RUN: %libomp-compile-and-run +#include +#include +int main() +{ + enum {ITERS = 
500}; + enum {SIZE = 5}; + int err = 0; + #pragma omp parallel num_threads(2) reduction(+:err) + { + int r = 0; + int i; + #pragma omp taskloop grainsize(SIZE) shared(r) nogroup + for(i=0; i +#include +#include "omp_testsuite.h" +#include "omp_my_sleep.h" + +int test_omp_taskyield_tied() +{ + int i; + int count = 0; + int start_tid[NUM_TASKS]; + int current_tid[NUM_TASKS]; + + for (i=0; i< NUM_TASKS; i++) { + start_tid[i]=0; + current_tid[i]=0; + } + + #pragma omp parallel + { + #pragma omp single + { + for (i = 0; i < NUM_TASKS; i++) { + int myi = i; + #pragma omp task + { + my_sleep(SLEEPTIME); + start_tid[myi] = omp_get_thread_num(); + #pragma omp taskyield + if((start_tid[myi] %2) ==0){ + my_sleep(SLEEPTIME); + } /*end of if*/ + current_tid[myi] = omp_get_thread_num(); + } /* end of omp task */ + } /* end of for */ + } /* end of single */ + } /* end of parallel */ + for (i=0;i +#include +#include +#include "omp_my_sleep.h" + +int a = 0; + +void task1() { + my_sleep(0.5); + a = 10; +} + +void task2() { + a++; +} + +int main(int argc, char** argv) +{ + #pragma omp parallel shared(argc) num_threads(2) + { + #pragma omp single + { + #pragma omp task depend(out: a) + task1(); + + #pragma omp task if(0) depend(inout: a) + task2(); + } + } + if (a != 11) { + fprintf(stderr, "fail: expected 11, but a is %d\n", a); + exit(1); + } else { + printf("pass\n"); + } + return 0; +} diff --git a/runtime/test/tasking/taskdep_if0_2.c b/runtime/test/tasking/taskdep_if0_2.c new file mode 100644 index 000000000..a208291e2 --- /dev/null +++ b/runtime/test/tasking/taskdep_if0_2.c @@ -0,0 +1,105 @@ +// RUN: %libomp-compile-and-run +// REQUIRES: !abt + +#include +#include +#include +#include "omp_my_sleep.h" + +int a = 0, b = 0; +int task_grabbed = 0, task_can_proceed = 0; +int task2_grabbed = 0, task2_can_proceed = 0; + +static void wait_on_flag(int *flag) { + int flag_value; + int timelimit = 30; + int secs = 0; + do { + #pragma omp atomic read + flag_value = *flag; + my_sleep(1.0); + secs++; + if (secs == timelimit) { + fprintf(stderr, "error: timeout in wait_on_flag()\n"); + exit(EXIT_FAILURE); + } + } while (flag_value == 0); +} + +static void signal_flag(int *flag) { + #pragma omp atomic + (*flag)++; +} + +int main(int argc, char** argv) { + + // Ensure two threads are running + int num_threads = omp_get_max_threads(); + if (num_threads < 2) + omp_set_num_threads(2); + + #pragma omp parallel shared(a) + { + int a_value; + // Let us be extra safe here + if (omp_get_num_threads() > 1) { + #pragma omp single nowait + { + // Schedule independent child task that + // waits to be flagged after sebsequent taskwait depend() + #pragma omp task + { + signal_flag(&task_grabbed); + wait_on_flag(&task_can_proceed); + } + // Let another worker thread grab the task to execute + wait_on_flag(&task_grabbed); + // This should be ignored since the task above has + // no dependency information + #pragma omp task if(0) depend(inout: a) + {} + // Signal the independent task to proceed + signal_flag(&task_can_proceed); + + // Schedule child task with dependencies that taskwait does + // not care about + #pragma omp task depend(inout: b) + { + signal_flag(&task2_grabbed); + wait_on_flag(&task2_can_proceed); + #pragma omp atomic + b++; + } + // Let another worker thread grab the task to execute + wait_on_flag(&task2_grabbed); + // This should be ignored since the task above has + // dependency information on b instead of a + #pragma omp task if(0) depend(inout: a) + {} + // Signal the task to proceed + 
signal_flag(&task2_can_proceed); + + // Generate one child task for taskwait + #pragma omp task shared(a) depend(inout: a) + { + my_sleep(1.0); + #pragma omp atomic + a++; + } + #pragma omp task if(0) depend(inout: a) + {} + + #pragma omp atomic read + a_value = a; + + if (a_value != 1) { + fprintf(stderr, "error: dependent task was not executed before " + "taskwait finished\n"); + exit(EXIT_FAILURE); + } + } // #pragma omp single + } // if (num_threads > 1) + } // #pragma omp parallel + + return EXIT_SUCCESS; +} diff --git a/runtime/test/teams/teams.c b/runtime/test/teams/teams.c new file mode 100644 index 000000000..bc009346a --- /dev/null +++ b/runtime/test/teams/teams.c @@ -0,0 +1,57 @@ +// RUN: %libomp-compile-and-run +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8 +// UNSUPPORTED: icc, clang + +#include +#include +#include + +#define NUM_TEAMS 2 +#define NUM_THREADS_PER_TEAM 3 + +int main(int argc, char** argv) { + #pragma omp teams num_teams(NUM_TEAMS) + { + int i; + int members[NUM_THREADS_PER_TEAM]; + // Only an upper bound is guaranteed for number of teams + int nteams = omp_get_num_teams(); + if (nteams > NUM_TEAMS) { + fprintf(stderr, "error: too many teams: %d\n", nteams); + exit(1); + } + for (i = 0; i < NUM_THREADS_PER_TEAM; ++i) + members[i] = -1; + #pragma omp parallel num_threads(NUM_THREADS_PER_TEAM) private(i) + { + int tid = omp_get_thread_num(); + int team_id = omp_get_team_num(); + int nthreads = omp_get_num_threads(); + if (nthreads != NUM_THREADS_PER_TEAM) { + fprintf(stderr, "error: detected number of threads (%d) is not %d\n", + nthreads, NUM_THREADS_PER_TEAM); + exit(1); + } + if (tid < 0 || tid >= nthreads) { + fprintf(stderr, "error: thread id is out of range: %d\n", tid); + exit(1); + } + if (team_id < 0 || team_id > omp_get_num_teams()) { + fprintf(stderr, "error: team id is out of range: %d\n", team_id); + exit(1); + } + members[omp_get_thread_num()] = 1; + #pragma omp barrier + #pragma omp single + { + for (i = 0; i < NUM_THREADS_PER_TEAM; ++i) { + if (members[i] != 1) { + fprintf(stderr, "error: worker %d not flagged\n", i); + exit(1); + } + } + } + } + } + return 0; +} diff --git a/runtime/test/threadprivate/omp_threadprivate.c b/runtime/test/threadprivate/omp_threadprivate.c index a3dd80d9d..26e547d87 100644 --- a/runtime/test/threadprivate/omp_threadprivate.c +++ b/runtime/test/threadprivate/omp_threadprivate.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) /* * Threadprivate is tested in 2 ways: * 1. 
The global variable declared as threadprivate should have @@ -62,7 +63,7 @@ int test_omp_threadprivate() my_random = rand(); /* random number generator is called inside serial region*/ - /* the first parallel region is used to initialiye myvalue + /* the first parallel region is used to initialize myvalue and the array with my_random+rank */ #pragma omp parallel { diff --git a/runtime/test/threadprivate/omp_threadprivate_for.c b/runtime/test/threadprivate/omp_threadprivate_for.c index 3342e6342..4b360a431 100644 --- a/runtime/test/threadprivate/omp_threadprivate_for.c +++ b/runtime/test/threadprivate/omp_threadprivate_for.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include "omp_testsuite.h" #include #include diff --git a/runtime/test/worksharing/for/bug_set_schedule_0.c b/runtime/test/worksharing/for/bug_set_schedule_0.c index 889e2393e..813b31c20 100644 --- a/runtime/test/worksharing/for/bug_set_schedule_0.c +++ b/runtime/test/worksharing/for/bug_set_schedule_0.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run + #include #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/kmp_sch_simd_guided.c b/runtime/test/worksharing/for/kmp_sch_simd_guided.c index 5c6f94bc7..39bcd4e79 100644 --- a/runtime/test/worksharing/for/kmp_sch_simd_guided.c +++ b/runtime/test/worksharing/for/kmp_sch_simd_guided.c @@ -6,6 +6,7 @@ */ #include #include +#include "omp_testsuite.h" #if defined(WIN32) || defined(_WIN32) #include @@ -128,7 +129,7 @@ int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum if (!(cur <= max + 1)) { - printf("Error with iter %d, %d\n", cur, max); + printf("Error with iter %llu, %llu\n", cur, max); err++; } // Update maximum for the next chunk @@ -176,9 +177,12 @@ int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { if (loop_sync != 0) { break; }; // if + if (!(i & (32 - 1))) + THREAD_SCHED_POINT(); }; // for i while (loop_sync == 0) { delay(); + THREAD_SCHED_POINT(); }; // while // At this moment we do not have any more chunks -- all the chunks already // processed by master thread @@ -280,7 +284,7 @@ int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) { // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum if (!(cur <= max + 1)) { - printf("Error with iter %d, %d\n", cur, max); + printf("Error with iter %llu, %llu\n", cur, max); err++; } // Update maximum for the next chunk @@ -331,6 +335,7 @@ int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) { }; // for i while (loop_sync == 0) { delay(); + THREAD_SCHED_POINT(); }; // while // At this moment we do not have any more chunks -- all the chunks already // processed by the master thread diff --git a/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c b/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c index 987a5c0d4..85c9af025 100644 --- a/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c +++ b/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c @@ -132,7 +132,7 @@ run_loop( // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum. if (!( cur <= max + 1)) - printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + printf("Error with iter %llu, %llu, err %d\n", cur, max, ++err); // Update maximum for the next chunk. 
if (last) { if (!no_chunk && cur > ch && nthreads > 1) diff --git a/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c b/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c index 5dfaf2418..99b22ab20 100644 --- a/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c +++ b/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c @@ -140,7 +140,7 @@ run_loop( // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum. if (!( cur <= max + 1)) - printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + printf("Error with iter %llu, %llu, err %d\n", cur, max, ++err); // Update maximum for the next chunk. if (!last && cur % ch) printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", diff --git a/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c b/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c index d76046bac..2048c278e 100644 --- a/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c +++ b/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c @@ -5,6 +5,7 @@ // in combination with OMP_SCHEDULE=static[,chunk] #include #include +#include #include #if defined(WIN32) || defined(_WIN32) @@ -133,7 +134,7 @@ run_loop( // Guided scheduling uses FP computations, so current chunk may // be a bit bigger (+1) than allowed maximum. if (!( cur <= max + 1)) - printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + printf("Error with iter %llu, %llu, err %d\n", cur, max, ++err); // Update maximum for the next chunk. if (last) { if (!no_chunk && cur > ch && nthreads > 1) diff --git a/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/runtime/test/worksharing/for/kmp_set_dispatch_buf.c index a6378fee9..244acdc97 100644 --- a/runtime/test/worksharing/for/kmp_set_dispatch_buf.c +++ b/runtime/test/worksharing/for/kmp_set_dispatch_buf.c @@ -3,6 +3,7 @@ // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 // RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7 // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 +// UNSUPPORTED: clang-11, clang-12 #include #include #include diff --git a/runtime/test/worksharing/for/omp_for_bigbounds.c b/runtime/test/worksharing/for/omp_for_bigbounds.c index 901d76083..adc00ff45 100644 --- a/runtime/test/worksharing/for/omp_for_bigbounds.c +++ b/runtime/test/worksharing/for/omp_for_bigbounds.c @@ -2,12 +2,14 @@ // RUN: %libomp-compile -DMY_SCHEDULE=dynamic && %libomp-run // RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run -// Only works with Intel Compiler since at least version 15.0 -// XFAIL: gcc, clang +// Only works with Intel Compiler since at least version 15.0 and clang since +// version 11. 
+ +// XFAIL: gcc, clang-3, clang-4, clang-5, clang-6, clang-7, clang-8, clang-9, clang-10 /* * Test that large bounds are handled properly and calculations of - * loop iterations don't accidently overflow + * loop iterations don't accidentally overflow */ #include #include diff --git a/runtime/test/worksharing/for/omp_for_collapse.c b/runtime/test/worksharing/for/omp_for_collapse.c index a08086dcc..afac72b0a 100644 --- a/runtime/test/worksharing/for/omp_for_collapse.c +++ b/runtime/test/worksharing/for/omp_for_collapse.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !abt #include #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/omp_for_collapse_mini.c b/runtime/test/worksharing/for/omp_for_collapse_mini.c new file mode 100644 index 000000000..4ff82779e --- /dev/null +++ b/runtime/test/worksharing/for/omp_for_collapse_mini.c @@ -0,0 +1,51 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +/* Utility function to check that i is increasing monotonically + with each call */ +static int check_i_islarger (int i) +{ + static int last_i; + int islarger; + if (i==1) + last_i=0; + islarger = ((i >= last_i)&&(i - last_i<=1)); + last_i = i; + return (islarger); +} + +int test_omp_for_collapse() +{ + int is_larger = 1; + + #pragma omp parallel + { + int i,j; + int my_islarger = 1; + #pragma omp for private(i,j) schedule(static,1) collapse(2) ordered + for (i = 1; i < 5; i++) { + for (j =1; j < 5; j++) { + #pragma omp ordered + my_islarger = check_i_islarger(i)&&my_islarger; + } + } + #pragma omp critical + is_larger = is_larger && my_islarger; + } + return (is_larger); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_collapse()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/for/omp_for_firstprivate.c b/runtime/test/worksharing/for/omp_for_firstprivate.c index 6c4121cce..255c22ed0 100644 --- a/runtime/test/worksharing/for/omp_for_firstprivate.c +++ b/runtime/test/worksharing/for/omp_for_firstprivate.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/omp_for_firstprivate_nothreadprivate.c b/runtime/test/worksharing/for/omp_for_firstprivate_nothreadprivate.c new file mode 100644 index 000000000..a56f9149f --- /dev/null +++ b/runtime/test/worksharing/for/omp_for_firstprivate_nothreadprivate.c @@ -0,0 +1,52 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +int test_omp_for_firstprivate() +{ + int sum; + int sum0; + int known_sum; + int threadsnum; + + sum = 0; + sum0 = 12345; + + #pragma omp parallel + { + int sum1 = 0; + #pragma omp single + { + threadsnum=omp_get_num_threads(); + } + /* sum0 = 0; */ + + int i; + #pragma omp for firstprivate(sum0) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum0 + i; + sum1 = sum0; + } /* end of for */ + + #pragma omp critical + { + sum = sum + sum1; + } /* end of critical */ + } /* end of parallel */ + known_sum = 12345* threadsnum+ (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_firstprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/for/omp_for_lastprivate.c b/runtime/test/worksharing/for/omp_for_lastprivate.c index 88694b816..e4aa99f45 100644 --- 
a/runtime/test/worksharing/for/omp_for_lastprivate.c +++ b/runtime/test/worksharing/for/omp_for_lastprivate.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/omp_for_lastprivate_nothreadprivate.c b/runtime/test/worksharing/for/omp_for_lastprivate_nothreadprivate.c new file mode 100644 index 000000000..aa0249cce --- /dev/null +++ b/runtime/test/worksharing/for/omp_for_lastprivate_nothreadprivate.c @@ -0,0 +1,49 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +int test_omp_for_lastprivate() +{ + int sum = 0; + int known_sum; + int i0; + + i0 = -1; + + #pragma omp parallel + { + int sum0 = 0; + { /* Begin of orphaned block */ + int i; + #pragma omp for schedule(static,7) lastprivate(i0) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum0 + i; + i0 = i; + } /* end of for */ + } /* end of orphaned block */ + + #pragma omp critical + { + sum = sum + sum0; + } /* end of critical */ + } /* end of parallel */ + + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + fprintf(stderr, "known_sum = %d , sum = %d\n",known_sum,sum); + fprintf(stderr, "LOOPCOUNT = %d , i0 = %d\n",LOOPCOUNT,i0); + return ((known_sum == sum) && (i0 == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for (i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_lastprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/for/omp_for_nowait.c b/runtime/test/worksharing/for/omp_for_nowait.c index 95a477538..a75c99f82 100644 --- a/runtime/test/worksharing/for/omp_for_nowait.c +++ b/runtime/test/worksharing/for/omp_for_nowait.c @@ -18,7 +18,8 @@ void wait_for_release_then_increment(int rank) { fprintf(stderr, "Thread nr %d enters first for construct" " and waits.\n", rank); - while (release == 0); + while (release == 0) + THREAD_SCHED_POINT(); #pragma omp atomic count++; } diff --git a/runtime/test/worksharing/for/omp_for_private.c b/runtime/test/worksharing/for/omp_for_private.c index 1f537b90c..6bbffff72 100644 --- a/runtime/test/worksharing/for/omp_for_private.c +++ b/runtime/test/worksharing/for/omp_for_private.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/omp_for_private_nothreadprivate.c b/runtime/test/worksharing/for/omp_for_private_nothreadprivate.c new file mode 100644 index 000000000..c507b1de8 --- /dev/null +++ b/runtime/test/worksharing/for/omp_for_private_nothreadprivate.c @@ -0,0 +1,60 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_testsuite.h" + +/* Utility function do spend some time in a loop */ +static void do_some_work() +{ + int i; + double sum = 0; + for(i = 0; i < 1000; i++){ + sum += sqrt ((double) i); + } +} + +int test_omp_for_private() +{ + int sum = 0; + int sum0; + int known_sum; + + sum0 = 0; /* setting (global) sum0 = 0 */ + + #pragma omp parallel + { + int sum1 = 0; /* setting sum1 in each thread to 0 */ + { /* begin of orphaned block */ + int i; + #pragma omp for private(sum0) schedule(static,1) + for (i = 1; i <= LOOPCOUNT; i++) { + sum0 = sum1; + #pragma omp flush + sum0 = sum0 + i; + do_some_work (); + #pragma omp flush + sum1 = sum0; + } + } /* end of orphaned block */ + + #pragma omp critical + { + sum = sum + sum1; + } /*end of critical*/ + } /* end of parallel*/ + known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == 
sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/for/omp_for_schedule_auto.c b/runtime/test/worksharing/for/omp_for_schedule_auto.c index 075617c21..66366044f 100644 --- a/runtime/test/worksharing/for/omp_for_schedule_auto.c +++ b/runtime/test/worksharing/for/omp_for_schedule_auto.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include #include diff --git a/runtime/test/worksharing/for/omp_for_schedule_auto_nothreadprivate.c b/runtime/test/worksharing/for/omp_for_schedule_auto_nothreadprivate.c new file mode 100644 index 000000000..655315d42 --- /dev/null +++ b/runtime/test/worksharing/for/omp_for_schedule_auto_nothreadprivate.c @@ -0,0 +1,66 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include +#include "omp_testsuite.h" + +int test_omp_for_auto() +{ + int j; + int sum; + int sum0; + int known_sum; + int threadsnum; + + sum = 0; + sum0 = 12345; + + // array which keeps track of which threads participated in the for loop + // e.g., given 4 threads, [ 0 | 1 | 1 | 0 ] implies + // threads 0 and 3 did not, threads 1 and 2 did + int max_threads = omp_get_max_threads(); + int* active_threads = (int*)malloc(sizeof(int)*max_threads); + for(j = 0; j < max_threads; j++) + active_threads[j] = 0; + + #pragma omp parallel + { + int i; + int sum1 = 0; + #pragma omp for firstprivate(sum0) schedule(auto) + for (i = 1; i <= LOOPCOUNT; i++) { + active_threads[omp_get_thread_num()] = 1; + sum0 = sum0 + i; + sum1 = sum0; + } + + #pragma omp critical + { + sum = sum + sum1; + } + } + + // count the threads that participated (sum is stored in threadsnum) + threadsnum=0; + for(j = 0; j < max_threads; j++) { + if(active_threads[j]) + threadsnum++; + } + free(active_threads); + + known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + return (known_sum == sum); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_for_auto()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/runtime/test/worksharing/for/omp_for_schedule_dynamic.c index 6d4f59b58..4433d2a3d 100644 --- a/runtime/test/worksharing/for/omp_for_schedule_dynamic.c +++ b/runtime/test/worksharing/for/omp_for_schedule_dynamic.c @@ -1,7 +1,7 @@ // RUN: %libomp-compile-and-run /* * Test for dynamic scheduling with chunk size - * Method: caculate how many times the iteration space is dispatched + * Method: calculate how many times the iteration space is dispatched * and judge if each dispatch has the requested chunk size * unless it is the last one. 
* It is possible for two adjacent chunks are assigned to the same thread diff --git a/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/runtime/test/worksharing/for/omp_for_schedule_runtime.c index b957fc38f..27a76567f 100644 --- a/runtime/test/worksharing/for/omp_for_schedule_runtime.c +++ b/runtime/test/worksharing/for/omp_for_schedule_runtime.c @@ -10,6 +10,7 @@ // RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13 // RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1 // RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14 + #include #include #include diff --git a/runtime/test/worksharing/for/omp_for_schedule_static_3.c b/runtime/test/worksharing/for/omp_for_schedule_static_3.c index 922f27abe..10bee86fa 100644 --- a/runtime/test/worksharing/for/omp_for_schedule_static_3.c +++ b/runtime/test/worksharing/for/omp_for_schedule_static_3.c @@ -147,10 +147,10 @@ int test_omp_for_schedule_static_3() } /* Now we check if several loop regions in one parallel region have the - * same logical assignement of chunks to threads. We use the nowait + * same logical assignment of chunks to threads. We use the nowait * clause to increase the probability to get an error. */ - /* First we allocate some more memmory */ + /* First we allocate some more memory */ free (tids); tids = (int *) malloc (sizeof (int) * LOOPCOUNT); tids2 = (int *) malloc (sizeof (int) * LOOPCOUNT); diff --git a/runtime/test/worksharing/for/omp_nonmonotonic_dynamic1.c b/runtime/test/worksharing/for/omp_nonmonotonic_dynamic1.c new file mode 100644 index 000000000..0691353fe --- /dev/null +++ b/runtime/test/worksharing/for/omp_nonmonotonic_dynamic1.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile +// RUN: env OMP_SCHEDULE=nonmonotonic:dynamic,10 %libomp-run + +// The test checks iterations distribution for OMP 5.0 nonmonotonic OMP_SCHEDULE +// case #threads > #chunks (fallback to monotonic dynamic) + +#include +#include + +#define ITERS 100 +#define CHUNK 10 +int err = 0; + +int main(int argc, char **argv) { + int i, ch, it[ITERS]; + omp_set_num_threads(16); // #threads is bigger than #chunks +#pragma omp parallel for schedule(runtime) + for (i = 0; i < ITERS; ++i) { + it[i] = omp_get_thread_num(); + } + // check that each chunk executed by single thread + for (ch = 0; ch < ITERS/CHUNK; ++ch) { + int iter = ch * CHUNK; + int nt = it[iter]; // thread number + for (i = 1; i < CHUNK; ++i) { +#if _DEBUG + printf("iter %d: (%d %d)\n", iter + i, nt, it[iter + i]); +#endif + if (nt != it[iter + i]) { + err++; + } + } + } + if (err > 0) { + printf("Failed, err = %d\n", err); + return 1; + } + printf("Passed\n"); + return 0; +} diff --git a/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c b/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c new file mode 100644 index 000000000..47ea1150b --- /dev/null +++ b/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c @@ -0,0 +1,34 @@ +// RUN: %libomp-compile-and-run + +// The test checks nonmonotonic scheduling works correctly when threads +// may execute different loops concurrently. 
+ +#include +#include + +#define N 200 +#define C 20 +int main() +{ + int i, l0 = 0, l1 = 0; + #pragma omp parallel num_threads(8) + { + #pragma omp for schedule(nonmonotonic:dynamic,C) nowait + for (i = 0; i < N; ++i) { + #pragma omp atomic + l0++; + } + #pragma omp for schedule(nonmonotonic:dynamic,C) nowait + for (i = 0; i < N * N; ++i) { + #pragma omp atomic + l1++; + } + } + if (l0 != N || l1 != N * N) { + printf("failed l0 = %d, l1 = %d, should be %d %d\n", l0, l1, N, N * N); + return 1; + } else { + printf("passed\n"); + return 0; + } +} diff --git a/runtime/test/worksharing/for/omp_parallel_for_ordered.c b/runtime/test/worksharing/for/omp_parallel_for_ordered.c index 5fef46040..453f3dd9e 100644 --- a/runtime/test/worksharing/for/omp_parallel_for_ordered.c +++ b/runtime/test/worksharing/for/omp_parallel_for_ordered.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/for/omp_parallel_for_ordered_nothreadprivate.c b/runtime/test/worksharing/for/omp_parallel_for_ordered_nothreadprivate.c new file mode 100644 index 000000000..39478d950 --- /dev/null +++ b/runtime/test/worksharing/for/omp_parallel_for_ordered_nothreadprivate.c @@ -0,0 +1,55 @@ +// RUN: %libomp-compile-and-run +#include +#include "omp_testsuite.h" + +static int last_i = 0; + +/*! + Utility function: returns true if the passed argument is larger than + the argument of the last call of this function. + */ +static int check_i_islarger2(int i) +{ + int islarger; + islarger = (i > last_i); + last_i = i; + return (islarger); +} + +int test_omp_parallel_for_ordered() +{ + int sum; + int is_larger; + int known_sum; + int i; + + sum = 0; + is_larger = 1; + last_i = 0; + #pragma omp parallel for schedule(static,1) private(i) ordered + for (i = 1; i < 100; i++) { + int ii = i; + #pragma omp ordered + { + is_larger = check_i_islarger2 (ii) && is_larger; + sum = sum + ii; + } + } + known_sum = (99 * 100) / 2; + fprintf (stderr," known_sum = %d , sum = %d \n", known_sum, sum); + fprintf (stderr," is_larger = %d\n", is_larger); + return (known_sum == sum) && is_larger; +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_parallel_for_ordered()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/sections/omp_sections_nowait.c b/runtime/test/worksharing/sections/omp_sections_nowait.c index caff254f4..8be612a14 100644 --- a/runtime/test/worksharing/sections/omp_sections_nowait.c +++ b/runtime/test/worksharing/sections/omp_sections_nowait.c @@ -20,7 +20,8 @@ void wait_for_release_then_increment(int rank) { fprintf(stderr, "Thread nr %d enters first section" " and waits.\n", rank); - while (release == 0); + while (release == 0) + THREAD_SCHED_POINT(); #pragma omp atomic count++; } diff --git a/runtime/test/worksharing/single/omp_single_copyprivate.c b/runtime/test/worksharing/single/omp_single_copyprivate.c index 2fece5c10..3608f51d9 100644 --- a/runtime/test/worksharing/single/omp_single_copyprivate.c +++ b/runtime/test/worksharing/single/omp_single_copyprivate.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include "omp_testsuite.h" #define DEBUG_TEST 0 diff --git a/runtime/test/worksharing/single/omp_single_copyprivate_nothreadprivate.c b/runtime/test/worksharing/single/omp_single_copyprivate_nothreadprivate.c new file mode 100644 index 000000000..7709e7b7c --- /dev/null +++ 
b/runtime/test/worksharing/single/omp_single_copyprivate_nothreadprivate.c @@ -0,0 +1,58 @@ +// RUN: %libomp-compile-and-run +#include "omp_testsuite.h" + +#define DEBUG_TEST 0 + +int test_omp_single_copyprivate() +{ + int result; + int nr_iterations; + + result = 0; + nr_iterations = 0; + int j; + #pragma omp parallel num_threads(4) private(j) + { + int i; + for (i = 0; i < LOOPCOUNT; i++) + { +#if DEBUG_TEST + int thread; + thread = omp_get_thread_num (); +#endif + #pragma omp single copyprivate(j) + { + nr_iterations++; + j = i; +#if DEBUG_TEST + printf ("thread %d assigns, j = %d, i = %d\n", thread, j, i); +#endif + } +#if DEBUG_TEST + #pragma omp barrier +#endif + #pragma omp critical + { +#if DEBUG_TEST + printf ("thread = %d, j = %d, i = %d\n", thread, j, i); +#endif + result = result + j - i; + } + #pragma omp barrier + } /* end of for */ + } /* end of parallel */ + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_single_copyprivate()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/test/worksharing/single/omp_single_nowait.c b/runtime/test/worksharing/single/omp_single_nowait.c index 22f8930d9..bc4144bfc 100644 --- a/runtime/test/worksharing/single/omp_single_nowait.c +++ b/runtime/test/worksharing/single/omp_single_nowait.c @@ -15,7 +15,8 @@ void wait_for_release_then_increment(int rank) { fprintf(stderr, "Thread nr %d enters first section" " and waits.\n", rank); - while (release == 0); + while (release == 0) + THREAD_SCHED_POINT(); #pragma omp atomic count++; } diff --git a/runtime/test/worksharing/single/omp_single_private.c b/runtime/test/worksharing/single/omp_single_private.c index a27f8de21..cbdd6236a 100644 --- a/runtime/test/worksharing/single/omp_single_private.c +++ b/runtime/test/worksharing/single/omp_single_private.c @@ -1,4 +1,5 @@ // RUN: %libomp-compile-and-run +// REQUIRES: !(abt && (clang || gcc)) #include #include "omp_testsuite.h" diff --git a/runtime/test/worksharing/single/omp_single_private_nothreadprivate.c b/runtime/test/worksharing/single/omp_single_private_nothreadprivate.c new file mode 100644 index 000000000..a711358f5 --- /dev/null +++ b/runtime/test/worksharing/single/omp_single_private_nothreadprivate.c @@ -0,0 +1,51 @@ +// RUN: %libomp-compile-and-run +#include +#include "omp_testsuite.h" + +int test_omp_single_private() +{ + int nr_threads_in_single; + int result; + int nr_iterations; + int i; + + nr_threads_in_single = 0; + nr_iterations = 0; + result = 0; + + #pragma omp parallel private(i) + { + int myresult = 0; + int myit = 0; + for (i = 0; i < LOOPCOUNT; i++) { + #pragma omp single private(nr_threads_in_single) nowait + { + nr_threads_in_single = 0; + #pragma omp flush + nr_threads_in_single++; + #pragma omp flush + myit++; + myresult = myresult + nr_threads_in_single; + } + } + #pragma omp critical + { + result += nr_threads_in_single; + nr_iterations += myit; + } + } + return ((result == 0) && (nr_iterations == LOOPCOUNT)); +} /* end of check_single private */ + +int main() +{ + int i; + int num_failed=0; + + for(i = 0; i < REPETITIONS; i++) { + if(!test_omp_single_private()) { + num_failed++; + } + } + return num_failed; +} diff --git a/runtime/tools/check-depends.pl b/runtime/tools/check-depends.pl index 6f8c8af5a..3cd30c2be 100755 --- a/runtime/tools/check-depends.pl +++ b/runtime/tools/check-depends.pl @@ -23,7 +23,7 @@ my $target_arch; # 
-------------------------------------------------------------------------------------------------- -# Ouput parse error. +# Output parse error. # $tool -- Name of tool. # @bulk -- Output of the tool. # $n -- Number of line caused parse error. @@ -406,11 +406,11 @@ =head1 DESCRIPTION C finds direct dependencies for a specified library. List of actual dependencies is sorted alphabetically and printed. If list of expected dependencies is specified, the scripts -checks the library has only allowed dependencies. In case of not expected depndencies the script +checks the library has only allowed dependencies. In case of not expected dependencies. the script issues error message and exits with non-zero code. -Linux* OS and OS X*: The script finds dependencies only for dymamic libraries. Windows* OS: The script -finds dependencies for either static or dymamic libraries. +Linux* OS and OS X*: The script finds dependencies only for dynamic libraries. Windows* OS: The script +finds dependencies for either static or dynamic libraries. The script uses external tools. On Linux* OS, it runs F, on OS X* -- F (or F), on Windows* OS -- F. diff --git a/runtime/tools/check-execstack.pl b/runtime/tools/check-execstack.pl index 34f77e1c6..e4a8e7c88 100755 --- a/runtime/tools/check-execstack.pl +++ b/runtime/tools/check-execstack.pl @@ -74,7 +74,7 @@ =head1 NAME =head1 SYNOPSIS -B I... I... +B I

+ + + + +# License + +Archer is distributed under the terms of the Apache License. + +Please see LICENSE.txt for usage terms. + +LLNL-CODE-773957 + + + +# Introduction + +**Archer** is an OMPT tool which annotates OpenMP synchronization semantics for data race +detection. +This avoids false alerts in data race detection. +Archer is automatically loaded for OpenMP applications which are compiled +with ThreadSanitizer option. + + + +# Build Archer within Clang/LLVM + +This distribution of Archer is automatically built with the OpenMP runtime +and automatically loaded by the OpenMP runtime. + + + +# Usage + + + + +## How to compile + +To use archer, compile the application with the extra flag +`-fsanitize=thread`: + + clang -O3 -g -fopenmp -fsanitize=thread app.c + clang++ -O3 -g -fopenmp -fsanitize=thread app.cpp + +To compile Fortran applications, compile with gfortran, link with clang: + + gfortran -g -c -fopenmp -fsanitize=thread app.f + clang -fopenmp -fsanitize=thread app.o -lgfortran + + + + +## Runtime Flags + +TSan runtime flags are passed via **TSAN_OPTIONS** environment variable, +we highly recommend the following option to avoid false alerts for the +OpenMP or MPI runtime implementation: + + export TSAN_OPTIONS="ignore_noninstrumented_modules=1" + + +Runtime flags are passed via **ARCHER_OPTIONS** environment variable, +different flags are separated by spaces, e.g.: + + ARCHER_OPTIONS="flush_shadow=1" ./myprogram + + + + +++ ++ ++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Flag Name | Default value | Description |
| --- | --- | --- |
| flush_shadow | 0 | Flush shadow memory at the end of an outer OpenMP parallel region. Our experiments show that this can reduce memory overhead by ~30% and runtime overhead by ~10%. This flag is useful for large OpenMP applications that typically require large amounts of memory, causing out-of-memory exceptions when checked by Archer. |
| print_max_rss | 0 | Print the RSS memory peak at the end of the execution. |
| ignore_serial | 0 | Turn off tracking and analysis of memory accesses in the sequential part of an OpenMP program. (Only effective when the OpenMP runtime is initialized. In doubt, insert omp_get_max_threads() as the first statement in main!) |
| verbose | 0 | Print startup information. |
| enable | 1 | Use the Archer runtime library during execution. |
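Since flags in **ARCHER_OPTIONS** are separated by spaces, several of the options above can be combined in a single run; for example (the program name is illustrative):

    ARCHER_OPTIONS="flush_shadow=1 print_max_rss=1 verbose=1" ./myprogram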
+ + + + +# Example + +Let us take the program below and follow the steps to compile and +check the program for data races. + +Suppose our program is called *myprogram.c*: + + 1 #include + 2 + 3 #define N 1000 + 4 + 5 int main (int argc, char **argv) + 6 { + 7 int a[N]; + 8 + 9 #pragma omp parallel for + 10 for (int i = 0; i < N - 1; i++) { + 11 a[i] = a[i + 1]; + 12 } + 13 } + +We compile the program as follow: + + clang -fsanitize=thread -fopenmp -g myprogram.c -o myprogram + +Now we can run the program with the following commands: + + export OMP_NUM_THREADS=2 + ./myprogram + +Archer will output a report in case it finds data races. In our case +the report will look as follow: + + ================== + WARNING: ThreadSanitizer: data race (pid=13641) + Read of size 4 at 0x7fff79a01170 by main thread: + #0 .omp_outlined. myprogram.c:11:12 (myprogram+0x00000049b5a2) + #1 __kmp_invoke_microtask (libomp.so+0x000000077842) + #2 __libc_start_main /build/glibc-t3gR2i/glibc-2.23/csu/../csu/libc-start.c:291 (libc.so.6+0x00000002082f) + + Previous write of size 4 at 0x7fff79a01170 by thread T1: + #0 .omp_outlined. myprogram.c:11:10 (myprogram+0x00000049b5d6) + #1 __kmp_invoke_microtask (libomp.so+0x000000077842) + + Location is stack of main thread. + + Thread T1 (tid=13643, running) created by main thread at: + #0 pthread_create tsan_interceptors.cc:902:3 (myprogram+0x00000043db75) + #1 __kmp_create_worker (libomp.so+0x00000006c364) + #2 __libc_start_main /build/glibc-t3gR2i/glibc-2.23/csu/../csu/libc-start.c:291 (libc.so.6+0x00000002082f) + + SUMMARY: ThreadSanitizer: data race myprogram.c:11:12 in .omp_outlined. + ================== + ThreadSanitizer: reported 1 warnings + + + + +# Contacts and Support + +- [Google group](https://groups.google.com/forum/#!forum/archer-pruner) +- [Slack Channel](https://pruners.slack.com) + +
  - For an invitation please write an email to Simone Atzeni with a reason why you want to be part of the PRUNERS Slack Team.
+- E-Mail Contacts: + + + + diff --git a/tools/archer/ompt-tsan.cpp b/tools/archer/ompt-tsan.cpp new file mode 100644 index 000000000..3ab0eadbb --- /dev/null +++ b/tools/archer/ompt-tsan.cpp @@ -0,0 +1,1040 @@ +/* + * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined __APPLE__ && defined __MACH__) +#include +#endif + +#include "omp-tools.h" +#include + +// Define attribute that indicates that the fall through from the previous +// case label is intentional and should not be diagnosed by a compiler +// Code from libcxx/include/__config +// Use a function like macro to imply that it must be followed by a semicolon +#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) +#define KMP_FALLTHROUGH() [[fallthrough]] +#elif __has_cpp_attribute(clang::fallthrough) +#define KMP_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_attribute(fallthrough) || __GNUC__ >= 7 +#define KMP_FALLTHROUGH() __attribute__((__fallthrough__)) +#else +#define KMP_FALLTHROUGH() ((void)0) +#endif + +static int runOnTsan; +static int hasReductionCallback; + +class ArcherFlags { +public: +#if (LLVM_VERSION) >= 40 + int flush_shadow{0}; +#endif + int print_max_rss{0}; + int verbose{0}; + int enabled{1}; + int ignore_serial{0}; + + ArcherFlags(const char *env) { + if (env) { + std::vector tokens; + std::string token; + std::string str(env); + std::istringstream iss(str); + while (std::getline(iss, token, ' ')) + tokens.push_back(token); + + for (std::vector::iterator it = tokens.begin(); + it != tokens.end(); ++it) { +#if (LLVM_VERSION) >= 40 + if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow)) + continue; +#endif + if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss)) + continue; + if (sscanf(it->c_str(), "verbose=%d", &verbose)) + continue; + if (sscanf(it->c_str(), "enable=%d", &enabled)) + continue; + if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial)) + continue; + std::cerr << "Illegal values for ARCHER_OPTIONS variable: " << token + << std::endl; + } + } + } +}; + +class TsanFlags { +public: + int ignore_noninstrumented_modules; + + TsanFlags(const char *env) : ignore_noninstrumented_modules(0) { + if (env) { + std::vector tokens; + std::string str(env); + auto end = str.end(); + auto it = str.begin(); + auto is_sep = [](char c) { + return c == ' ' || c == ',' || c == ':' || c == '\n' || c == '\t' || + c == '\r'; + }; + while (it != end) { + auto next_it = std::find_if(it, end, is_sep); + tokens.emplace_back(it, next_it); + it = next_it; + if (it != end) { + ++it; + } + } + + for (const auto &token : tokens) { + // we are interested in ignore_noninstrumented_modules to print a + // warning + if (sscanf(token.c_str(), "ignore_noninstrumented_modules=%d", + &ignore_noninstrumented_modules)) + continue; + } + } + } +}; + +#if (LLVM_VERSION) >= 40 +extern "C" { +int __attribute__((weak)) __archer_get_omp_status(); +void __attribute__((weak)) __tsan_flush_memory() {} +} +#endif 
+ArcherFlags *archer_flags; + +// The following definitions are pasted from "llvm/Support/Compiler.h" to allow +// the code +// to be compiled with other compilers like gcc: + +#ifndef TsanHappensBefore +// Thread Sanitizer is a tool that finds races in code. +// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations . +// tsan detects these exact functions by name. +extern "C" { +#if (defined __APPLE__ && defined __MACH__) +static void AnnotateHappensAfter(const char *file, int line, + const volatile void *cv) { + void (*fptr)(const char *, int, const volatile void *); + + fptr = (void (*)(const char *, int, const volatile void *))dlsym( + RTLD_DEFAULT, "AnnotateHappensAfter"); + (*fptr)(file, line, cv); +} +static void AnnotateHappensBefore(const char *file, int line, + const volatile void *cv) { + void (*fptr)(const char *, int, const volatile void *); + + fptr = (void (*)(const char *, int, const volatile void *))dlsym( + RTLD_DEFAULT, "AnnotateHappensBefore"); + (*fptr)(file, line, cv); +} +static void AnnotateIgnoreWritesBegin(const char *file, int line) { + void (*fptr)(const char *, int); + + fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT, + "AnnotateIgnoreWritesBegin"); + (*fptr)(file, line); +} +static void AnnotateIgnoreWritesEnd(const char *file, int line) { + void (*fptr)(const char *, int); + + fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT, + "AnnotateIgnoreWritesEnd"); + (*fptr)(file, line); +} +static void AnnotateNewMemory(const char *file, int line, + const volatile void *cv, size_t size) { + void (*fptr)(const char *, int, const volatile void *, size_t); + + fptr = (void (*)(const char *, int, const volatile void *, size_t))dlsym( + RTLD_DEFAULT, "AnnotateNewMemory"); + (*fptr)(file, line, cv, size); +} +static int RunningOnValgrind() { + int (*fptr)(); + + fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind"); + if (fptr && fptr != RunningOnValgrind) + runOnTsan = 0; + return 0; +} +#else +void __attribute__((weak)) +AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {} +void __attribute__((weak)) +AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {} +void __attribute__((weak)) +AnnotateIgnoreWritesBegin(const char *file, int line) {} +void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) { +} +void __attribute__((weak)) +AnnotateNewMemory(const char *file, int line, const volatile void *cv, + size_t size) {} +int __attribute__((weak)) RunningOnValgrind() { + runOnTsan = 0; + return 0; +} +void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {} +void __attribute__((weak)) __tsan_func_exit(void) {} +#endif +} + +// This marker is used to define a happens-before arc. The race detector will +// infer an arc from the begin to the end when they share the same pointer +// argument. +#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv) + +// This marker defines the destination of a happens-before arc. +#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv) + +// Ignore any races on writes between here and the next TsanIgnoreWritesEnd. +#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + +// Resume checking for racy writes. 
+#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + +// We don't really delete the clock for now +#define TsanDeleteClock(cv) + +// newMemory +#define TsanNewMemory(addr, size) \ + AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#define TsanFreeMemory(addr, size) \ + AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#endif + +// Function entry/exit +#define TsanFuncEntry(pc) __tsan_func_entry(pc) +#define TsanFuncExit() __tsan_func_exit() + +/// Required OMPT inquiry functions. +static ompt_get_parallel_info_t ompt_get_parallel_info; +static ompt_get_thread_data_t ompt_get_thread_data; + +typedef uint64_t ompt_tsan_clockid; + +static uint64_t my_next_id() { + static uint64_t ID = 0; + uint64_t ret = __sync_fetch_and_add(&ID, 1); + return ret; +} + +// Data structure to provide a threadsafe pool of reusable objects. +// DataPool +template struct DataPool { + std::mutex DPMutex; + std::stack DataPointer; + std::list memory; + int total; + + void newDatas() { + // prefix the Data with a pointer to 'this', allows to return memory to + // 'this', + // without explicitly knowing the source. + // + // To reduce lock contention, we use thread local DataPools, but Data + // objects move to other threads. + // The strategy is to get objects from local pool. Only if the object moved + // to another + // thread, we might see a penalty on release (returnData). + // For "single producer" pattern, a single thread creates tasks, these are + // executed by other threads. + // The master will have a high demand on TaskData, so return after use. + struct pooldata { + DataPool *dp; + T data; + }; + // We alloc without initialize the memory. We cannot call constructors. + // Therefore use malloc! + pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N); + memory.push_back(datas); + for (int i = 0; i < N; i++) { + datas[i].dp = this; + DataPointer.push(&(datas[i].data)); + } + total += N; + } + + T *getData() { + T *ret; + DPMutex.lock(); + if (DataPointer.empty()) + newDatas(); + ret = DataPointer.top(); + DataPointer.pop(); + DPMutex.unlock(); + return ret; + } + + void returnData(T *data) { + DPMutex.lock(); + DataPointer.push(data); + DPMutex.unlock(); + } + + void getDatas(int n, T **datas) { + DPMutex.lock(); + for (int i = 0; i < n; i++) { + if (DataPointer.empty()) + newDatas(); + datas[i] = DataPointer.top(); + DataPointer.pop(); + } + DPMutex.unlock(); + } + + void returnDatas(int n, T **datas) { + DPMutex.lock(); + for (int i = 0; i < n; i++) { + DataPointer.push(datas[i]); + } + DPMutex.unlock(); + } + + DataPool() : DPMutex(), DataPointer(), total(0) {} + + ~DataPool() { + // we assume all memory is returned when the thread finished / destructor is + // called + for (auto i : memory) + if (i) + free(i); + } +}; + +// This function takes care to return the data to the originating DataPool +// A pointer to the originating DataPool is stored just before the actual data. +template static void retData(void *data) { + ((DataPool **)data)[-1]->returnData((T *)data); +} + +struct ParallelData; +__thread DataPool *pdp; + +/// Data structure to store additional information for parallel regions. +struct ParallelData { + + // Parallel fork is just another barrier, use Barrier[1] + + /// Two addresses for relationships with barriers. 
+ ompt_tsan_clockid Barrier[2]; + + const void *codePtr; + + void *GetParallelPtr() { return &(Barrier[1]); } + + void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); } + + ParallelData(const void *codeptr) : codePtr(codeptr) {} + ~ParallelData() { + TsanDeleteClock(&(Barrier[0])); + TsanDeleteClock(&(Barrier[1])); + } + // overload new/delete to use DataPool for memory management. + void *operator new(size_t size) { return pdp->getData(); } + void operator delete(void *p, size_t) { retData(p); } +}; + +static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) { + return reinterpret_cast(parallel_data->ptr); +} + +struct Taskgroup; +__thread DataPool *tgp; + +/// Data structure to support stacking of taskgroups and allow synchronization. +struct Taskgroup { + /// Its address is used for relationships of the taskgroup's task set. + ompt_tsan_clockid Ptr; + + /// Reference to the parent taskgroup. + Taskgroup *Parent; + + Taskgroup(Taskgroup *Parent) : Parent(Parent) {} + ~Taskgroup() { TsanDeleteClock(&Ptr); } + + void *GetPtr() { return &Ptr; } + // overload new/delete to use DataPool for memory management. + void *operator new(size_t size) { return tgp->getData(); } + void operator delete(void *p, size_t) { retData(p); } +}; + +struct TaskData; +__thread DataPool *tdp; + +/// Data structure to store additional information for tasks. +struct TaskData { + /// Its address is used for relationships of this task. + ompt_tsan_clockid Task; + + /// Child tasks use its address to declare a relationship to a taskwait in + /// this task. + ompt_tsan_clockid Taskwait; + + /// Whether this task is currently executing a barrier. + bool InBarrier; + + /// Whether this task is an included task. + int TaskType{0}; + + /// Index of which barrier to use next. + char BarrierIndex; + + /// Count how often this structure has been put into child tasks + 1. + std::atomic_int RefCount; + + /// Reference to the parent that created this task. + TaskData *Parent; + + /// Reference to the implicit task in the stack above this task. + TaskData *ImplicitTask; + + /// Reference to the team of this task. + ParallelData *Team; + + /// Reference to the current taskgroup that this task either belongs to or + /// that it just created. + Taskgroup *TaskGroup; + + /// Dependency information for this task. + ompt_dependence_t *Dependencies; + + /// Number of dependency entries. + unsigned DependencyCount; + + void *PrivateData; + size_t PrivateDataSize; + + int execution; + int freed; + + TaskData(TaskData *Parent, int taskType) + : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1), + Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team), + TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) { + if (Parent != nullptr) { + Parent->RefCount++; + // Copy over pointer to taskgroup. This task may set up its own stack + // but for now belongs to its parent's taskgroup. 
+ TaskGroup = Parent->TaskGroup; + } + } + + TaskData(ParallelData *Team, int taskType) + : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1), + Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr), + DependencyCount(0), execution(1), freed(0) {} + + ~TaskData() { + TsanDeleteClock(&Task); + TsanDeleteClock(&Taskwait); + } + + bool isIncluded() { return TaskType & ompt_task_undeferred; } + bool isUntied() { return TaskType & ompt_task_untied; } + bool isFinal() { return TaskType & ompt_task_final; } + bool isMergable() { return TaskType & ompt_task_mergeable; } + bool isMerged() { return TaskType & ompt_task_merged; } + + bool isExplicit() { return TaskType & ompt_task_explicit; } + bool isImplicit() { return TaskType & ompt_task_implicit; } + bool isInitial() { return TaskType & ompt_task_initial; } + bool isTarget() { return TaskType & ompt_task_target; } + + void *GetTaskPtr() { return &Task; } + + void *GetTaskwaitPtr() { return &Taskwait; } + // overload new/delete to use DataPool for memory management. + void *operator new(size_t size) { return tdp->getData(); } + void operator delete(void *p, size_t) { retData(p); } +}; + +static inline TaskData *ToTaskData(ompt_data_t *task_data) { + return reinterpret_cast(task_data->ptr); +} + +static inline void *ToInAddr(void *OutAddr) { + // FIXME: This will give false negatives when a second variable lays directly + // behind a variable that only has a width of 1 byte. + // Another approach would be to "negate" the address or to flip the + // first bit... + return reinterpret_cast(OutAddr) + 1; +} + +/// Store a mutex for each wait_id to resolve race condition with callbacks. +std::unordered_map Locks; +std::mutex LocksMutex; + +static void ompt_tsan_thread_begin(ompt_thread_t thread_type, + ompt_data_t *thread_data) { + pdp = new DataPool; + TsanNewMemory(pdp, sizeof(pdp)); + tgp = new DataPool; + TsanNewMemory(tgp, sizeof(tgp)); + tdp = new DataPool; + TsanNewMemory(tdp, sizeof(tdp)); + thread_data->value = my_next_id(); +} + +static void ompt_tsan_thread_end(ompt_data_t *thread_data) { + delete pdp; + delete tgp; + delete tdp; +} + +/// OMPT event callbacks for handling parallel regions. 
+ +static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data, + const ompt_frame_t *parent_task_frame, + ompt_data_t *parallel_data, + uint32_t requested_team_size, int flag, + const void *codeptr_ra) { + ParallelData *Data = new ParallelData(codeptr_ra); + parallel_data->ptr = Data; + + TsanHappensBefore(Data->GetParallelPtr()); + if (archer_flags->ignore_serial && ToTaskData(parent_task_data)->isInitial()) + TsanIgnoreWritesEnd(); +} + +static void ompt_tsan_parallel_end(ompt_data_t *parallel_data, + ompt_data_t *task_data, int flag, + const void *codeptr_ra) { + if (archer_flags->ignore_serial && ToTaskData(task_data)->isInitial()) + TsanIgnoreWritesBegin(); + ParallelData *Data = ToParallelData(parallel_data); + TsanHappensAfter(Data->GetBarrierPtr(0)); + TsanHappensAfter(Data->GetBarrierPtr(1)); + + delete Data; + +#if (LLVM_VERSION >= 40) + if (&__archer_get_omp_status) { + if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow) + __tsan_flush_memory(); + } +#endif +} + +static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + unsigned int team_size, + unsigned int thread_num, int type) { + switch (endpoint) { + case ompt_scope_begin: + if (type & ompt_task_initial) { + parallel_data->ptr = new ParallelData(nullptr); + } + task_data->ptr = new TaskData(ToParallelData(parallel_data), type); + TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr()); + TsanFuncEntry(ToParallelData(parallel_data)->codePtr); + break; + case ompt_scope_end: { + TaskData *Data = ToTaskData(task_data); + assert(Data->freed == 0 && "Implicit task end should only be called once!"); + Data->freed = 1; + assert(Data->RefCount == 1 && + "All tasks should have finished at the implicit barrier!"); + delete Data; + TsanFuncExit(); + break; + } + case ompt_scope_beginend: + // Should not occur according to OpenMP 5.1 + // Tested in OMPT tests + break; + } +} + +static void ompt_tsan_sync_region(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + TaskData *Data = ToTaskData(task_data); + switch (endpoint) { + case ompt_scope_begin: + case ompt_scope_beginend: + TsanFuncEntry(codeptr_ra); + switch (kind) { + case ompt_sync_region_barrier_implementation: + case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_explicit: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_teams: + case ompt_sync_region_barrier: { + char BarrierIndex = Data->BarrierIndex; + TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex)); + + if (hasReductionCallback < ompt_set_always) { + // We ignore writes inside the barrier. These would either occur during + // 1. reductions performed by the runtime which are guaranteed to be + // race-free. + // 2. execution of another task. + // For the latter case we will re-enable tracking in task_switch. 
+ Data->InBarrier = true; + TsanIgnoreWritesBegin(); + } + + break; + } + + case ompt_sync_region_taskwait: + break; + + case ompt_sync_region_taskgroup: + Data->TaskGroup = new Taskgroup(Data->TaskGroup); + break; + + case ompt_sync_region_reduction: + // should never be reached + break; + + } + if (endpoint == ompt_scope_begin) + break; + KMP_FALLTHROUGH(); + case ompt_scope_end: + TsanFuncExit(); + switch (kind) { + case ompt_sync_region_barrier_implementation: + case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_explicit: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_teams: + case ompt_sync_region_barrier: { + if (hasReductionCallback < ompt_set_always) { + // We want to track writes after the barrier again. + Data->InBarrier = false; + TsanIgnoreWritesEnd(); + } + + char BarrierIndex = Data->BarrierIndex; + // Barrier will end after it has been entered by all threads. + if (parallel_data) + TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex)); + + // It is not guaranteed that all threads have exited this barrier before + // we enter the next one. So we will use a different address. + // We are however guaranteed that this current barrier is finished + // by the time we exit the next one. So we can then reuse the first + // address. + Data->BarrierIndex = (BarrierIndex + 1) % 2; + break; + } + + case ompt_sync_region_taskwait: { + if (Data->execution > 1) + TsanHappensAfter(Data->GetTaskwaitPtr()); + break; + } + + case ompt_sync_region_taskgroup: { + assert(Data->TaskGroup != nullptr && + "Should have at least one taskgroup!"); + + TsanHappensAfter(Data->TaskGroup->GetPtr()); + + // Delete this allocated taskgroup, all descendent task are finished by + // now. + Taskgroup *Parent = Data->TaskGroup->Parent; + delete Data->TaskGroup; + Data->TaskGroup = Parent; + break; + } + + case ompt_sync_region_reduction: + // Should not occur according to OpenMP 5.1 + // Tested in OMPT tests + break; + + } + break; + } +} + +static void ompt_tsan_reduction(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + switch (endpoint) { + case ompt_scope_begin: + switch (kind) { + case ompt_sync_region_reduction: + TsanIgnoreWritesBegin(); + break; + default: + break; + } + break; + case ompt_scope_end: + switch (kind) { + case ompt_sync_region_reduction: + TsanIgnoreWritesEnd(); + break; + default: + break; + } + break; + case ompt_scope_beginend: + // Should not occur according to OpenMP 5.1 + // Tested in OMPT tests + // Would have no implications for DR detection + break; + } +} + +/// OMPT event callbacks for handling tasks. 
+ +static void ompt_tsan_task_create( + ompt_data_t *parent_task_data, /* id of parent task */ + const ompt_frame_t *parent_frame, /* frame data for parent task */ + ompt_data_t *new_task_data, /* id of created task */ + int type, int has_dependences, + const void *codeptr_ra) /* pointer to outlined function */ +{ + TaskData *Data; + assert(new_task_data->ptr == NULL && + "Task data should be initialized to NULL"); + if (type & ompt_task_initial) { + ompt_data_t *parallel_data; + int team_size = 1; + ompt_get_parallel_info(0, ¶llel_data, &team_size); + ParallelData *PData = new ParallelData(nullptr); + parallel_data->ptr = PData; + + Data = new TaskData(PData, type); + new_task_data->ptr = Data; + } else if (type & ompt_task_undeferred) { + Data = new TaskData(ToTaskData(parent_task_data), type); + new_task_data->ptr = Data; + } else if (type & ompt_task_explicit || type & ompt_task_target) { + Data = new TaskData(ToTaskData(parent_task_data), type); + new_task_data->ptr = Data; + + // Use the newly created address. We cannot use a single address from the + // parent because that would declare wrong relationships with other + // sibling tasks that may be created before this task is started! + TsanHappensBefore(Data->GetTaskPtr()); + ToTaskData(parent_task_data)->execution++; + } +} + +static void __ompt_tsan_release_task(TaskData *task) { + while (task != nullptr && --task->RefCount == 0) { + TaskData *Parent = task->Parent; + if (task->DependencyCount > 0) { + delete[] task->Dependencies; + } + delete task; + task = Parent; + } +} + +static void ompt_tsan_task_schedule(ompt_data_t *first_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *second_task_data) { + + // + // The necessary action depends on prior_task_status: + // + // ompt_task_early_fulfill = 5, + // -> ignored + // + // ompt_task_late_fulfill = 6, + // -> first completed, first freed, second ignored + // + // ompt_task_complete = 1, + // ompt_task_cancel = 3, + // -> first completed, first freed, second starts + // + // ompt_task_detach = 4, + // ompt_task_yield = 2, + // ompt_task_switch = 7 + // -> first suspended, second starts + // + + if (prior_task_status == ompt_task_early_fulfill) + return; + + TaskData *FromTask = ToTaskData(first_task_data); + + // Legacy handling for missing reduction callback + if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) { + // We want to ignore writes in the runtime code during barriers, + // but not when executing tasks with user code! + TsanIgnoreWritesEnd(); + } + + // The late fulfill happens after the detached task finished execution + if (prior_task_status == ompt_task_late_fulfill) + TsanHappensAfter(FromTask->GetTaskPtr()); + + // task completed execution + if (prior_task_status == ompt_task_complete || + prior_task_status == ompt_task_cancel || + prior_task_status == ompt_task_late_fulfill) { + // Included tasks are executed sequentially, no need to track + // synchronization + if (!FromTask->isIncluded()) { + // Task will finish before a barrier in the surrounding parallel region + // ... + ParallelData *PData = FromTask->Team; + TsanHappensBefore( + PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex)); + + // ... and before an eventual taskwait by the parent thread. + TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr()); + + if (FromTask->TaskGroup != nullptr) { + // This task is part of a taskgroup, so it will finish before the + // corresponding taskgroup_end. 
+ TsanHappensBefore(FromTask->TaskGroup->GetPtr()); + } + } + + // release dependencies + for (unsigned i = 0; i < FromTask->DependencyCount; i++) { + ompt_dependence_t *Dependency = &FromTask->Dependencies[i]; + + // in dependencies block following inout and out dependencies! + TsanHappensBefore(ToInAddr(Dependency->variable.ptr)); + if (Dependency->dependence_type == ompt_dependence_type_out || + Dependency->dependence_type == ompt_dependence_type_inout) { + TsanHappensBefore(Dependency->variable.ptr); + } + } + // free the previously running task + __ompt_tsan_release_task(FromTask); + } + + // For late fulfill of detached task, there is no task to schedule to + if (prior_task_status == ompt_task_late_fulfill) { + return; + } + + TaskData *ToTask = ToTaskData(second_task_data); + // Legacy handling for missing reduction callback + if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) { + // We re-enter runtime code which currently performs a barrier. + TsanIgnoreWritesBegin(); + } + + // task suspended + if (prior_task_status == ompt_task_switch || + prior_task_status == ompt_task_yield || + prior_task_status == ompt_task_detach) { + // Task may be resumed at a later point in time. + TsanHappensBefore(FromTask->GetTaskPtr()); + ToTask->ImplicitTask = FromTask->ImplicitTask; + assert(ToTask->ImplicitTask != NULL && + "A task belongs to a team and has an implicit task on the stack"); + } + + // Handle dependencies on first execution of the task + if (ToTask->execution == 0) { + ToTask->execution++; + for (unsigned i = 0; i < ToTask->DependencyCount; i++) { + ompt_dependence_t *Dependency = &ToTask->Dependencies[i]; + + TsanHappensAfter(Dependency->variable.ptr); + // in and inout dependencies are also blocked by prior in dependencies! + if (Dependency->dependence_type == ompt_dependence_type_out || + Dependency->dependence_type == ompt_dependence_type_inout) { + TsanHappensAfter(ToInAddr(Dependency->variable.ptr)); + } + } + } + // 1. Task will begin execution after it has been created. + // 2. Task will resume after it has been switched away. + TsanHappensAfter(ToTask->GetTaskPtr()); +} + +static void ompt_tsan_dependences(ompt_data_t *task_data, + const ompt_dependence_t *deps, int ndeps) { + if (ndeps > 0) { + // Copy the data to use it in task_switch and task_end. + TaskData *Data = ToTaskData(task_data); + Data->Dependencies = new ompt_dependence_t[ndeps]; + std::memcpy(Data->Dependencies, deps, sizeof(ompt_dependence_t) * ndeps); + Data->DependencyCount = ndeps; + + // This callback is executed before this task is first started. + TsanHappensBefore(Data->GetTaskPtr()); + } +} + +/// OMPT event callbacks for handling locking. +static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id, + const void *codeptr_ra) { + + // Acquire our own lock to make sure that + // 1. the previous release has finished. + // 2. the next acquire doesn't start before we have finished our release. 
+ LocksMutex.lock(); + std::mutex &Lock = Locks[wait_id]; + LocksMutex.unlock(); + + Lock.lock(); + TsanHappensAfter(&Lock); +} + +static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id, + const void *codeptr_ra) { + LocksMutex.lock(); + std::mutex &Lock = Locks[wait_id]; + LocksMutex.unlock(); + TsanHappensBefore(&Lock); + + Lock.unlock(); +} + +// callback , signature , variable to store result , required support level +#define SET_OPTIONAL_CALLBACK_T(event, type, result, level) \ + do { \ + ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event; \ + result = ompt_set_callback(ompt_callback_##event, \ + (ompt_callback_t)tsan_##event); \ + if (result < level) \ + printf("Registered callback '" #event "' is not supported at " #level \ + " (%i)\n", \ + result); \ + } while (0) + +#define SET_CALLBACK_T(event, type) \ + do { \ + int res; \ + SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always); \ + } while (0) + +#define SET_CALLBACK(event) SET_CALLBACK_T(event, event) + +static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num, + ompt_data_t *tool_data) { + const char *options = getenv("TSAN_OPTIONS"); + TsanFlags tsan_flags(options); + + ompt_set_callback_t ompt_set_callback = + (ompt_set_callback_t)lookup("ompt_set_callback"); + if (ompt_set_callback == NULL) { + std::cerr << "Could not set callback, exiting..." << std::endl; + std::exit(1); + } + ompt_get_parallel_info = + (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info"); + ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data"); + + if (ompt_get_parallel_info == NULL) { + fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', " + "exiting...\n"); + exit(1); + } + + SET_CALLBACK(thread_begin); + SET_CALLBACK(thread_end); + SET_CALLBACK(parallel_begin); + SET_CALLBACK(implicit_task); + SET_CALLBACK(sync_region); + SET_CALLBACK(parallel_end); + + SET_CALLBACK(task_create); + SET_CALLBACK(task_schedule); + SET_CALLBACK(dependences); + + SET_CALLBACK_T(mutex_acquired, mutex); + SET_CALLBACK_T(mutex_released, mutex); + SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback, + ompt_set_never); + + if (!tsan_flags.ignore_noninstrumented_modules) + fprintf(stderr, + "Warning: please export " + "TSAN_OPTIONS='ignore_noninstrumented_modules=1' " + "to avoid false positive reports from the OpenMP runtime!\n"); + if (archer_flags->ignore_serial) + TsanIgnoreWritesBegin(); + return 1; // success +} + +static void ompt_tsan_finalize(ompt_data_t *tool_data) { + if (archer_flags->ignore_serial) + TsanIgnoreWritesEnd(); + if (archer_flags->print_max_rss) { + struct rusage end; + getrusage(RUSAGE_SELF, &end); + printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss); + } + + if (archer_flags) + delete archer_flags; +} + +extern "C" ompt_start_tool_result_t * +ompt_start_tool(unsigned int omp_version, const char *runtime_version) { + const char *options = getenv("ARCHER_OPTIONS"); + archer_flags = new ArcherFlags(options); + if (!archer_flags->enabled) { + if (archer_flags->verbose) + std::cout << "Archer disabled, stopping operation" << std::endl; + delete archer_flags; + return NULL; + } + + static ompt_start_tool_result_t ompt_start_tool_result = { + &ompt_tsan_initialize, &ompt_tsan_finalize, {0}}; + runOnTsan = 1; + RunningOnValgrind(); + if (!runOnTsan) // if we are not running on TSAN, give a different tool the + // chance to be loaded + { + if (archer_flags->verbose) + std::cout << "Archer detected OpenMP application 
without TSan " + "stopping operation" + << std::endl; + delete archer_flags; + return NULL; + } + + if (archer_flags->verbose) + std::cout << "Archer detected OpenMP application with TSan, supplying " + "OpenMP synchronization semantics" + << std::endl; + return &ompt_start_tool_result; +} diff --git a/tools/archer/tests/CMakeLists.txt b/tools/archer/tests/CMakeLists.txt new file mode 100644 index 000000000..79ff6ba99 --- /dev/null +++ b/tools/archer/tests/CMakeLists.txt @@ -0,0 +1,42 @@ +# CMakeLists.txt file for unit testing Archer runtime library. +include(CheckFunctionExists) +include(CheckLibraryExists) + +# When using libgcc, -latomic may be needed for atomics +# (but when using compiler-rt, the atomics will be built-in) +# Note: we can not check for __atomic_load because clang treats it +# as special built-in and that breaks CMake checks +check_function_exists(__atomic_load_1 LIBARCHER_HAVE_BUILTIN_ATOMIC) +if(NOT LIBARCHER_HAVE_BUILTIN_ATOMIC) + check_library_exists(atomic __atomic_load_1 "" LIBARCHER_HAVE_LIBATOMIC) +else() + # not needed + set(LIBARCHER_HAVE_LIBATOMIC 0) +endif() + +set(LIBARCHER_TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + +set(LIBARCHER_TEST_FLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler.") + +macro(pythonize_bool var) + if (${var}) + set(${var} True) + else() + set(${var} False) + endif() +endmacro() + +pythonize_bool(LIBARCHER_HAVE_LIBATOMIC) +pythonize_bool(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS) + +set(ARCHER_TSAN_TEST_DEPENDENCE "") +if(TARGET tsan) + set(ARCHER_TSAN_TEST_DEPENDENCE tsan) +endif() + +add_openmp_testsuite(check-bolt-libarcher "Running libarcher tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS bolt-archer bolt-omp ${ARCHER_TSAN_TEST_DEPENDENCE}) + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libarcher configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/tools/archer/tests/barrier/barrier.c b/tools/archer/tests/barrier/barrier.c new file mode 100644 index 000000000..f2f938d9b --- /dev/null +++ b/tools/archer/tests/barrier/barrier.c @@ -0,0 +1,42 @@ +/* + * barrier.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) shared(var) + { + if (omp_get_thread_num() == 0) { + var++; + } + +#pragma omp barrier + + if (omp_get_thread_num() == 1) { + var++; + } + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/critical/critical.c b/tools/archer/tests/critical/critical.c new file mode 100644 index 000000000..2fc75453f --- /dev/null +++ b/tools/archer/tests/critical/critical.c @@ -0,0 +1,36 @@ +/* + * critical.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(8) shared(var) + { +#pragma omp critical + { var++; } + } + + fprintf(stderr, "DONE\n"); + int error = (var != 8); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/critical/lock-nested.c b/tools/archer/tests/critical/lock-nested.c new file mode 100644 index 000000000..3174aed6a --- /dev/null +++ b/tools/archer/tests/critical/lock-nested.c @@ -0,0 +1,44 @@ +/* + * lock-nested.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + + omp_nest_lock_t lock; + omp_init_nest_lock(&lock); + +#pragma omp parallel num_threads(2) shared(var) + { + omp_set_nest_lock(&lock); + omp_set_nest_lock(&lock); + var++; + omp_unset_nest_lock(&lock); + omp_unset_nest_lock(&lock); + } + + omp_destroy_nest_lock(&lock); + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/critical/lock.c b/tools/archer/tests/critical/lock.c new file mode 100644 index 000000000..c4157ae3a --- /dev/null +++ b/tools/archer/tests/critical/lock.c @@ -0,0 +1,42 @@ +/* + * lock.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + + omp_lock_t lock; + omp_init_lock(&lock); + +#pragma omp parallel num_threads(2) shared(var) + { + omp_set_lock(&lock); + var++; + omp_unset_lock(&lock); + } + + omp_destroy_lock(&lock); + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/deflake.bash b/tools/archer/tests/deflake.bash new file mode 100755 index 000000000..5700c134e --- /dev/null +++ b/tools/archer/tests/deflake.bash @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# This script is used to deflake inherently flaky archer tests. +# It is invoked from lit tests as: +# %deflake mybinary +# which is then substituted by lit to: +# $(dirname %s)/deflake.bash mybinary +# The script runs the target program up to 10 times, +# until it fails (i.e. produces a race report). + +for i in $(seq 1 10); do + OUT=`$@ 2>&1` + if [[ $? 
!= 0 ]]; then + echo "$OUT" + exit 0 + fi +done +exit 1 diff --git a/tools/archer/tests/lit.cfg b/tools/archer/tests/lit.cfg new file mode 100644 index 000000000..384a85cf6 --- /dev/null +++ b/tools/archer/tests/lit.cfg @@ -0,0 +1,135 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import re +import subprocess +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(path): + if config.operating_system == 'Windows': + name = 'PATH' + sep = ';' + elif config.operating_system == 'Darwin': + name = 'DYLD_LIBRARY_PATH' + sep = ':' + else: + name = 'LD_LIBRARY_PATH' + sep = ':' + if name in config.environment: + config.environment[name] = path + sep + config.environment[name] + else: + config.environment[name] = path + +# name: The name of this test suite. +config.name = 'bolt-libarcher' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.libarcher_obj_root + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.test_source_root + \ + " -I " + config.omp_header_dir + \ + " -L " + config.omp_library_dir + \ + " -Wl,-rpath," + config.omp_library_dir + \ + " " + config.test_archer_flags + \ + " " + config.test_extra_flags + +config.archer_flags = "-g -O1 -fsanitize=thread" + + +# extra libraries +libs = "" +if config.has_libatomic: + libs += " -latomic" + +# Allow XFAIL to work +config.target_triple = [ ] +for feature in config.test_compiler_features: + config.available_features.add(feature) + +# Setup environment to find dynamic library at runtime +append_dynamic_library_path(config.omp_library_dir) +append_dynamic_library_path(config.libarcher_obj_root+"/..") + +# Rpath modifications for Darwin +if config.operating_system == 'Darwin': + config.test_flags += " -Wl,-rpath," + config.omp_library_dir + +# Find the SDK on Darwin +if config.operating_system == 'Darwin': + cmd = subprocess.Popen(['xcrun', '--show-sdk-path'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = cmd.communicate() + out = out.strip() + res = cmd.wait() + if res == 0 and out: + config.test_flags += " -isysroot " + out + +if 'Linux' in config.operating_system: + config.available_features.add("linux") + +if config.has_tsan == True: + config.available_features.add("tsan") + +# to run with icc INTEL_LICENSE_FILE must be set +if 'INTEL_LICENSE_FILE' in os.environ: + config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE'] + +# Setup flags for BOLT. If BOLT is not used, they are ignored. +# Some tasking tests require larger stack size. +config.environment['ABT_THREAD_STACKSIZE'] = "262144" +# Sleep alleviates oversubscription overheads when -j is specified. 
+config.environment['KMP_ABT_SCHED_SLEEP'] = "1" + +# Race Tests +config.substitutions.append(("%libarcher-compile-and-run-race-noserial", \ + "%libarcher-compile && env ARCHER_OPTIONS=ignore_serial=1 %libarcher-run-race")) +config.substitutions.append(("%libarcher-compile-and-run-race", \ + "%libarcher-compile && %libarcher-run-race")) +config.substitutions.append(("%libarcher-compile-and-run-nosuppression", \ + "%libarcher-compile && %libarcher-run-nosuppression")) +config.substitutions.append(("%libarcher-compile-and-run", \ + "%libarcher-compile && %libarcher-run")) +config.substitutions.append(("%libarcher-cxx-compile-and-run", \ + "%libarcher-cxx-compile && %libarcher-run")) +config.substitutions.append(("%libarcher-cxx-compile", \ + "%clang-archerXX %openmp_flags %archer_flags %flags -std=c++14 %s -o %t" + libs)) +config.substitutions.append(("%libarcher-compile", \ + "%clang-archer %openmp_flags %archer_flags %flags %s -o %t" + libs)) +config.substitutions.append(("%libarcher-run-race", "%suppression %deflake %t 2>&1 | tee %t.log")) +config.substitutions.append(("%libarcher-run-nosuppression", "%nosuppression %t 2>&1 | tee %t.log")) +config.substitutions.append(("%libarcher-run", "%suppression %t 2>&1 | tee %t.log")) +config.substitutions.append(("%clang-archerXX", config.test_cxx_compiler)) +config.substitutions.append(("%clang-archer", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%archer_flags", config.archer_flags)) +config.substitutions.append(("%flags", config.test_flags)) +config.substitutions.append(("%nosuppression", "env TSAN_OPTIONS='ignore_noninstrumented_modules=0:exitcode=0'")) +config.substitutions.append(("%suppression", "env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1'")) +config.substitutions.append(("%deflake", os.path.join(os.path.dirname(__file__), "deflake.bash"))) + +config.substitutions.append(("FileCheck", config.test_filecheck)) +config.substitutions.append(("%not", config.test_not)) +config.substitutions.append(("%sort-threads", "sort --numeric-sort --stable")) +if config.operating_system == 'Windows': + # No such environment variable on Windows. 
+ config.substitutions.append(("%preload-tool", "true ||")) +elif config.operating_system == 'Darwin': + config.substitutions.append(("%preload-tool", "env DYLD_INSERT_LIBRARIES=%T/tool.so")) +else: + config.substitutions.append(("%preload-tool", "env LD_PRELOAD=%T/tool.so")) diff --git a/tools/archer/tests/lit.site.cfg.in b/tools/archer/tests/lit.site.cfg.in new file mode 100644 index 000000000..7010badf0 --- /dev/null +++ b/tools/archer/tests/lit.site.cfg.in @@ -0,0 +1,21 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ +config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.test_not = "@OPENMP_NOT_EXECUTABLE@" +config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@OPENMP_TEST_FLAGS@" +config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.omp_library_dir = "@LIBBOLT_LIBRARY_DIR@" +config.omp_header_dir = "@LIBBOLT_INCLUDE_DIR@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.has_libatomic = @LIBARCHER_HAVE_LIBATOMIC@ +config.has_tsan = @OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS@ + +config.test_archer_flags = "@LIBARCHER_TEST_FLAGS@" +config.libarcher_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/tools/archer/tests/ompt/ompt-signal.h b/tools/archer/tests/ompt/ompt-signal.h new file mode 100644 index 000000000..6b605e3a4 --- /dev/null +++ b/tools/archer/tests/ompt/ompt-signal.h @@ -0,0 +1,42 @@ +/* + * ompt-signal.h -- Header providing low-level synchronization for tests + */ + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is a copy from runtime/test/ompt/ +// +//===----------------------------------------------------------------------===// + +#if defined(WIN32) || defined(_WIN32) +#include +#define delay() Sleep(1); +#else +#include +#define delay(t) usleep(t); +#endif + +// These functions are used to provide a signal-wait mechanism to enforce +// expected scheduling for the test cases. +// Conditional variable (s) needs to be shared! Initialize to 0 + +#define OMPT_SIGNAL(s) ompt_signal(&s) +// inline +void ompt_signal(int *s) { +#pragma omp atomic + (*s)++; +} + +#define OMPT_WAIT(s, v) ompt_wait(&s, v) +// wait for s >= v +// inline +void ompt_wait(int *s, int v) { + int wait = 0; + do { + delay(10); +#pragma omp atomic read + wait = (*s); + } while (wait < v); +} diff --git a/tools/archer/tests/parallel/parallel-firstprivate.c b/tools/archer/tests/parallel/parallel-firstprivate.c new file mode 100644 index 000000000..97e8fcb52 --- /dev/null +++ b/tools/archer/tests/parallel/parallel-firstprivate.c @@ -0,0 +1,33 @@ +/* + * parallel-firstprivate.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) firstprivate(var) + { var = 1; } + + fprintf(stderr, "DONE\n"); + // var should still be 0! + return var; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/parallel/parallel-nosuppression.c b/tools/archer/tests/parallel/parallel-nosuppression.c new file mode 100644 index 000000000..de46ace01 --- /dev/null +++ b/tools/archer/tests/parallel/parallel-nosuppression.c @@ -0,0 +1,38 @@ +/* + * parallel-nosuppression.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run-nosuppression | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) shared(var) + { + if (omp_get_thread_num() == 1) { + var++; + } + } // implicit barrier + + var++; + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK: Warning: please export TSAN_OPTIONS +// CHECK: DONE diff --git a/tools/archer/tests/parallel/parallel-simple.c b/tools/archer/tests/parallel/parallel-simple.c new file mode 100644 index 000000000..dff410bd8 --- /dev/null +++ b/tools/archer/tests/parallel/parallel-simple.c @@ -0,0 +1,60 @@ +/* + * parallel-simple.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile && env OMP_TOOL_VERBOSE_INIT=stderr %libarcher-run 2>&1 | FileCheck %s --check-prefixes CHECK,TSAN_ON +// RUN: %clang-archer %openmp_flags %flags %s -o %t && env OMP_TOOL_VERBOSE_INIT=stderr %t 2>&1 | FileCheck %s --check-prefixes CHECK,TSAN_OFF +// REQUIRES: tsan +#include +#include + +// TSAN_ON: ----- START LOGGING OF TOOL REGISTRATION ----- +// TSAN_ON-NEXT: Search for OMP tool in current address space... Failed. +// TSAN_ON-NEXT: No OMP_TOOL_LIBRARIES defined. +// TSAN_ON-NEXT: ...searching tool libraries failed. Using archer tool. +// TSAN_ON-NEXT: Opening libarcher.so... Success. +// TSAN_ON-NEXT: Searching for ompt_start_tool in libarcher.so... Success. +// TSAN_ON-NEXT: Tool was started and is using the OMPT interface. +// TSAN_ON-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + +// TSAN_OFF: ----- START LOGGING OF TOOL REGISTRATION ----- +// TSAN_OFF-NEXT: Search for OMP tool in current address space... Failed. +// TSAN_OFF-NEXT: No OMP_TOOL_LIBRARIES defined. +// TSAN_OFF-NEXT: ...searching tool libraries failed. Using archer tool. +// TSAN_OFF-NEXT: Opening libarcher.so... Success. +// TSAN_OFF-NEXT: Searching for ompt_start_tool in libarcher.so... 
Found but not using the OMPT interface. +// TSAN_OFF-NEXT: No OMP tool loaded. +// TSAN_OFF-NEXT: ----- END LOGGING OF TOOL REGISTRATION ----- + + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) shared(var) + { + if (omp_get_thread_num() == 1) { + var++; + } + } // implicit barrier + + var++; + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK-NOT: Warning: please export TSAN_OPTIONS +// CHECK: DONE diff --git a/tools/archer/tests/parallel/parallel-simple2.c b/tools/archer/tests/parallel/parallel-simple2.c new file mode 100644 index 000000000..4663998a5 --- /dev/null +++ b/tools/archer/tests/parallel/parallel-simple2.c @@ -0,0 +1,44 @@ +/* + * parallel-simple2.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +// Create team of threads so that there is no implicit happens before +// when creating the thread. +#pragma omp parallel num_threads(2) + {} + + var++; + +#pragma omp parallel num_threads(2) shared(var) + { + if (omp_get_thread_num() == 1) { + var++; + } + } // implicit barrier + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/races/critical-unrelated.c b/tools/archer/tests/races/critical-unrelated.c new file mode 100644 index 000000000..af5a6d22a --- /dev/null +++ b/tools/archer/tests/races/critical-unrelated.c @@ -0,0 +1,41 @@ +/* + * critical-unrelated.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) shared(var) + { +#pragma omp critical + { + // Dummy region. 
+ } + + var++; + } + + fprintf(stderr, "DONE\n"); +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}critical-unrelated.c:29 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}critical-unrelated.c:29 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/lock-nested-unrelated.c b/tools/archer/tests/races/lock-nested-unrelated.c new file mode 100644 index 000000000..37b96296c --- /dev/null +++ b/tools/archer/tests/races/lock-nested-unrelated.c @@ -0,0 +1,47 @@ +/* + * lock-nested-unrelated.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + + omp_nest_lock_t lock; + omp_init_nest_lock(&lock); + +#pragma omp parallel num_threads(2) shared(var) + { + omp_set_nest_lock(&lock); + omp_set_nest_lock(&lock); + // Dummy locking. + omp_unset_nest_lock(&lock); + omp_unset_nest_lock(&lock); + + var++; + } + + omp_destroy_nest_lock(&lock); + + fprintf(stderr, "DONE\n"); +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}lock-nested-unrelated.c:33 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}lock-nested-unrelated.c:33 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/lock-unrelated.c b/tools/archer/tests/races/lock-unrelated.c new file mode 100644 index 000000000..8086ffdb1 --- /dev/null +++ b/tools/archer/tests/races/lock-unrelated.c @@ -0,0 +1,47 @@ +/* + * lock-unrelated.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + + omp_lock_t lock; + omp_init_lock(&lock); + +#pragma omp parallel num_threads(2) shared(var) + { + omp_set_lock(&lock); + // Dummy locking. 
+ omp_unset_lock(&lock); + + var++; + } + + omp_destroy_lock(&lock); + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}lock-unrelated.c:31 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}lock-unrelated.c:31 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/parallel-simple.c b/tools/archer/tests/races/parallel-simple.c new file mode 100644 index 000000000..009045ebb --- /dev/null +++ b/tools/archer/tests/races/parallel-simple.c @@ -0,0 +1,36 @@ +/* + * parallel-simple.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +#pragma omp parallel num_threads(2) shared(var) + { var++; } + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}parallel-simple.c:23 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}parallel-simple.c:23 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/task-dependency.c b/tools/archer/tests/races/task-dependency.c new file mode 100644 index 000000000..d5e2188c2 --- /dev/null +++ b/tools/archer/tests/races/task-dependency.c @@ -0,0 +1,60 @@ +/* + * task-dependency.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include "ompt/ompt-signal.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task shared(var, a) depend(out : var) + { + OMPT_SIGNAL(a); + var++; + } + +#pragma omp task shared(a) depend(in : var) + { + OMPT_SIGNAL(a); + OMPT_WAIT(a, 3); + } + +#pragma omp task shared(var) // depend(in: var) is missing here! + { + var++; + OMPT_SIGNAL(a); + } + + // Give other thread time to steal the task. 
+ OMPT_WAIT(a, 2); + } + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}task-dependency.c:41 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}task-dependency.c:30 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/task-taskgroup-unrelated.c b/tools/archer/tests/races/task-taskgroup-unrelated.c new file mode 100644 index 000000000..7bc03249e --- /dev/null +++ b/tools/archer/tests/races/task-taskgroup-unrelated.c @@ -0,0 +1,61 @@ +/* + * task-taskgroup-unrelated.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include "ompt/ompt-signal.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task shared(var, a) + { + var++; + OMPT_SIGNAL(a); + // Give master thread time to execute the task in the taskgroup. + OMPT_WAIT(a, 2); + } + +#pragma omp taskgroup + { +#pragma omp task if (0) + { + // Dummy task. + } + + // Give other threads time to steal the tasks. + OMPT_WAIT(a, 1); + OMPT_SIGNAL(a); + } + + var++; + } + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}task-taskgroup-unrelated.c:47 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}task-taskgroup-unrelated.c:29 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/task-taskwait-nested.c b/tools/archer/tests/races/task-taskwait-nested.c new file mode 100644 index 000000000..29aac18ba --- /dev/null +++ b/tools/archer/tests/races/task-taskwait-nested.c @@ -0,0 +1,58 @@ +/* + * task-taskwait-nested.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include "ompt/ompt-signal.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task shared(var, a) + { +#pragma omp task shared(var, a) + { + // wait for master to pass the taskwait + OMPT_SIGNAL(a); + OMPT_WAIT(a, 2); + var++; + } + } + + // Give other thread time to steal the task and execute its child. + OMPT_WAIT(a, 1); + +// Only directly generated children are guaranteed to be executed. 
+#pragma omp taskwait + OMPT_SIGNAL(a); + var++; + } + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}task-taskwait-nested.c:34 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}task-taskwait-nested.c:44 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/races/task-two.c b/tools/archer/tests/races/task-two.c new file mode 100644 index 000000000..281269fc4 --- /dev/null +++ b/tools/archer/tests/races/task-two.c @@ -0,0 +1,44 @@ +/* + * task-two.c -- Archer testcase + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %libarcher-compile-and-run-race | FileCheck %s +// RUN: %libarcher-compile-and-run-race-noserial | FileCheck %s +// REQUIRES: tsan +#include +#include +#include + +#define NUM_THREADS 2 + +int main(int argc, char *argv[]) { + int var = 0; + int i; + +#pragma omp parallel for num_threads(NUM_THREADS) shared(var) \ + schedule(static, 1) + for (i = 0; i < NUM_THREADS; i++) { +#pragma omp task shared(var) if (0) // the task is inlined an executed locally + { var++; } + } + + int error = (var != 2); + fprintf(stderr, "DONE\n"); + return error; +} + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}task-two.c:30 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}task-two.c:30 +// CHECK: DONE +// CHECK: ThreadSanitizer: reported 1 warnings diff --git a/tools/archer/tests/reduction/parallel-reduction-nowait.c b/tools/archer/tests/reduction/parallel-reduction-nowait.c new file mode 100644 index 000000000..b91579f0b --- /dev/null +++ b/tools/archer/tests/reduction/parallel-reduction-nowait.c @@ -0,0 +1,46 @@ +/* + * parallel-reduction-nowait.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0, i; + int sum1 = 0; + int sum2 = 0; + +// Number of threads is empirical: We need enough threads so that +// the reduction is really performed hierarchically in the barrier! 
+#pragma omp parallel num_threads(5) reduction(+ : var) + { +#pragma omp for schedule(static) nowait reduction(+ : sum1) + for (i = 0; i < 5; i++) + sum1 += i; +#pragma omp for schedule(static) reduction(+ : sum2) + for (i = 0; i < 5; i++) + sum2 += i; + + var = sum1 + sum2; + } + + fprintf(stderr, "DONE\n"); + int error = (var != 100); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/reduction/parallel-reduction.c b/tools/archer/tests/reduction/parallel-reduction.c new file mode 100644 index 000000000..6d1a556ac --- /dev/null +++ b/tools/archer/tests/reduction/parallel-reduction.c @@ -0,0 +1,35 @@ +/* + * parallel-reduction.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run| FileCheck %s +// REQUIRES: tsan +#include +#include + +int main(int argc, char *argv[]) { + int var = 0; + +// Number of threads is empirical: We need enough threads so that +// the reduction is really performed hierarchically in the barrier! +#pragma omp parallel num_threads(5) reduction(+ : var) + { var = 1; } + + fprintf(stderr, "DONE\n"); + int error = (var != 5); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-barrier.c b/tools/archer/tests/task/task-barrier.c new file mode 100644 index 000000000..23e597cea --- /dev/null +++ b/tools/archer/tests/task/task-barrier.c @@ -0,0 +1,52 @@ +/* + * task-barrier.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) + { +#pragma omp master + { +#pragma omp task shared(var) + { + OMPT_SIGNAL(a); + var++; + } + + // Give other thread time to steal the task. + OMPT_WAIT(a, 1); + } + +#pragma omp barrier + +#pragma omp master + { var++; } + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-create.c b/tools/archer/tests/task/task-create.c new file mode 100644 index 000000000..700bb335e --- /dev/null +++ b/tools/archer/tests/task/task-create.c @@ -0,0 +1,46 @@ +/* + * task-create.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { + var++; +#pragma omp task shared(var, a) + { + var++; + OMPT_SIGNAL(a); + } + + // Give other thread time to steal the task. + OMPT_WAIT(a, 1); + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-dependency.c b/tools/archer/tests/task/task-dependency.c new file mode 100644 index 000000000..a7a2a669c --- /dev/null +++ b/tools/archer/tests/task/task-dependency.c @@ -0,0 +1,54 @@ +/* + * task-dependency.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task shared(var, a) depend(out : var) + { + var++; + OMPT_SIGNAL(a); + } + +#pragma omp task shared(var, a) depend(in : var) + { OMPT_WAIT(a, 2); } + +#pragma omp task shared(var, a) depend(in : var) + { + OMPT_SIGNAL(a); + var++; + } + + // Give other thread time to steal the task. + OMPT_WAIT(a, 1); + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-taskgroup-nested.c b/tools/archer/tests/task/task-taskgroup-nested.c new file mode 100644 index 000000000..c82b6be3f --- /dev/null +++ b/tools/archer/tests/task/task-taskgroup-nested.c @@ -0,0 +1,53 @@ +/* + * task-taskgroup-nested.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp taskgroup + { +#pragma omp task + { +#pragma omp task shared(var, a) + { + var++; + OMPT_SIGNAL(a); + } + } + + // Give other thread time to steal the task and execute its child. 
+ OMPT_WAIT(a, 1); + } + + var++; + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-taskgroup.c b/tools/archer/tests/task/task-taskgroup.c new file mode 100644 index 000000000..799bd22dd --- /dev/null +++ b/tools/archer/tests/task/task-taskgroup.c @@ -0,0 +1,50 @@ +/* + * task-taskgroup.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp taskgroup + { +#pragma omp task shared(var, a) + { + var++; + OMPT_SIGNAL(a); + } + + // Give other thread time to steal the task. + OMPT_WAIT(a, 1); + } + + var++; + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-taskwait-nested.c b/tools/archer/tests/task/task-taskwait-nested.c new file mode 100644 index 000000000..fe3fb2787 --- /dev/null +++ b/tools/archer/tests/task/task-taskwait-nested.c @@ -0,0 +1,53 @@ +/* + * task-taskwait-nested.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task + { +#pragma omp task shared(var, a) + { + OMPT_SIGNAL(a); + delay(100); + var++; + } +#pragma omp taskwait + } + + // Give other thread time to steal the task and execute its child. + OMPT_WAIT(a, 1); + +#pragma omp taskwait + var++; + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task-taskwait.c b/tools/archer/tests/task/task-taskwait.c new file mode 100644 index 000000000..af334dc31 --- /dev/null +++ b/tools/archer/tests/task/task-taskwait.c @@ -0,0 +1,50 @@ +/* + * task-taskwait.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include +#include +#include "ompt/ompt-signal.h" + +int main(int argc, char *argv[]) { + int var = 0, a = 0; + +#pragma omp parallel num_threads(2) shared(var, a) +#pragma omp master + { +#pragma omp task shared(var, a) + { + OMPT_SIGNAL(a); + OMPT_WAIT(a, 2); + delay(100); + var++; + } + + // Give other thread time to steal the task. + OMPT_WAIT(a, 1); + OMPT_SIGNAL(a); +#pragma omp taskwait + var++; + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/archer/tests/task/task_early_fulfill.c b/tools/archer/tests/task/task_early_fulfill.c new file mode 100644 index 000000000..0990b36e4 --- /dev/null +++ b/tools/archer/tests/task/task_early_fulfill.c @@ -0,0 +1,26 @@ +// RUN: %libarcher-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ +// RUN: %libarcher-run +//| FileCheck %s + +// Checked gcc 9.2 still does not support detach clause on task construct. +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9 +// clang supports detach clause since version 11. +// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 +// icc compiler does not support detach clause. +// UNSUPPORTED: icc +// REQUIRES: tsan + +#include +#include + +int main() { +#pragma omp parallel +#pragma omp master + { + omp_event_handle_t event; +#pragma omp task detach(event) if (0) + { omp_fulfill_event(event); } +#pragma omp taskwait + } + return 0; +} diff --git a/tools/archer/tests/task/task_late_fulfill.c b/tools/archer/tests/task/task_late_fulfill.c new file mode 100644 index 000000000..d27409245 --- /dev/null +++ b/tools/archer/tests/task/task_late_fulfill.c @@ -0,0 +1,54 @@ +// RUN: %libarcher-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ +// RUN: %libarcher-run-race | FileCheck %s + +// Checked gcc 9.2 still does not support detach clause on task construct. +// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9 +// clang supports detach clause since version 11. +// UNSUPPORTED: clang-10, clang-9, clang-8, clang-7 +// icc compiler does not support detach clause. 
+// UNSUPPORTED: icc +// REQUIRES: tsan + +#include +#include +#include + +int main() { +#pragma omp parallel +#pragma omp master + { + omp_event_handle_t event; + int a = 0, b = 0; + omp_event_handle_t *f_event; +#pragma omp task detach(event) depend(out : f_event) shared(f_event) + { + printf("%i: task 1\n", omp_get_thread_num()); + f_event = &event; + } + usleep(10000); +#pragma omp task depend(in : f_event) shared(f_event, a, b) + { + printf("%i: task 2, %p, %i, %i\n", omp_get_thread_num(), f_event, a, b); + f_event = &event; + } + usleep(10000); + a++; + printf("%i: calling omp_fulfill_event\n", omp_get_thread_num()); + omp_fulfill_event(event); +//#pragma omp task if (0) depend(in : f_event) +// {} + b++; + usleep(10000); +#pragma omp taskwait + } + return 0; +} + +// no race for a++ in line 32: +// CHECK-NOT: #0 {{.*}}task_late_fulfill.c:35 + +// CHECK: WARNING: ThreadSanitizer: data race +// CHECK-NEXT: {{(Write|Read)}} of size 4 +// CHECK-NEXT: #0 {{.*}}task_late_fulfill.c:31 +// CHECK: Previous write of size 4 +// CHECK-NEXT: #0 {{.*}}task_late_fulfill.c:40 diff --git a/tools/archer/tests/worksharing/ordered.c b/tools/archer/tests/worksharing/ordered.c new file mode 100644 index 000000000..e10d9d153 --- /dev/null +++ b/tools/archer/tests/worksharing/ordered.c @@ -0,0 +1,39 @@ +/* + * ordered.c -- Archer testcase + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// +// See tools/archer/LICENSE.txt for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +// RUN: %libarcher-compile-and-run | FileCheck %s +// REQUIRES: tsan +#include +#include + +#define NUM_THREADS 2 + +int main(int argc, char *argv[]) { + int var = 0; + int i; + +#pragma omp parallel for ordered num_threads(NUM_THREADS) shared(var) + for (i = 0; i < NUM_THREADS; i++) { +#pragma omp ordered + { var++; } + } + + fprintf(stderr, "DONE\n"); + int error = (var != 2); + return error; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK-NOT: ThreadSanitizer: reported +// CHECK: DONE diff --git a/tools/multiplex/CMakeLists.txt b/tools/multiplex/CMakeLists.txt new file mode 100644 index 000000000..81c897649 --- /dev/null +++ b/tools/multiplex/CMakeLists.txt @@ -0,0 +1,10 @@ +if(LIBOMP_OMPT_SUPPORT) + include_directories(${LIBOMP_INCLUDE_DIR}) + + add_library(bolt-ompt-multiplex INTERFACE) + target_include_directories(bolt-ompt-multiplex INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) + + install(FILES ompt-multiplex.h DESTINATION include) + + add_subdirectory(tests) +endif() diff --git a/tools/multiplex/README.md b/tools/multiplex/README.md new file mode 100644 index 000000000..601a14a41 --- /dev/null +++ b/tools/multiplex/README.md @@ -0,0 +1,60 @@ +# OMPT-Multiplexing +The OMPT-Multiplexing header file allows a tool to load a second tool to +overcome the restriction of the OpenMP to only load one tool at a time. +The header file can also be used to load more than two tools using a cascade +of tools that include the header file. OMPT-Multiplexing takes care of the +multiplexing of OMPT callbacks, data pointers and runtime entry functions. 
+
+Examples can be found under ./tests
+
+## Prerequisites
+- LLVM/OpenMP runtime with OMPT (https://github.com/OpenMPToolsInterface/LLVM-openmp)
+- LLVM-lit
+
+### Getting LLVM-lit
+Either build LLVM and use lit and FileCheck from the LLVM build directory, or install lit via `pip`:
+```
+ $ pip install --upgrade --user pip
+ $ export PATH=$HOME/.local/bin:$PATH
+ $ export PYTHONPATH=$HOME/.local/lib/python3.*/site-packages/
+ $ pip install --user lit
+```
+
+## How to test
+```
+ $ make check-ompt-multiplex
+```
+
+## How to compile and use your OpenMP tools
+The code of the first tool must include the following, with the convention that the environment variable containing the path to the client tool is the tool name with the suffix "_TOOL_LIBRARIES":
+```
+#define CLIENT_TOOL_LIBRARIES_VAR "EXAMPLE_TOOL_LIBRARIES"
+#include <ompt-multiplex.h>
+```
+Note that functions and variables with the prefix "ompt_multiplex" are reserved by the multiplexing tool.
+
+
+To use both tools, execute the following:
+```
+ $ clang -fopenmp -o program.exe
+ $ OMP_TOOL_LIBRARIES=/path/to/first/tool.so EXAMPLE_TOOL_LIBRARIES=/path/to/second/tool.so ./program.exe
+```
+Note that EXAMPLE_TOOL_LIBRARIES may also contain a list of paths to tools, which are tried in order (similar to lists in OMP_TOOL_LIBRARIES). (A minimal sketch of such a client tool appears below.)
+
+## Advanced usage
+To reduce the number of memory allocations, the user can define macros that specify custom data access handlers before including the ompt-multiplex.h file:
+
+```
+#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA get_client_thread_data
+#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA get_client_parallel_data
+#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA get_client_task_data
+```
+
+This will reverse the calling order of the current tool and its client. To avoid this, one can specify a custom delete handler as well:
+
+```
+#define OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA delete_thread_data
+#define OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA delete_parallel_data
+#define OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA delete_task_data
+```
+
diff --git a/tools/multiplex/ompt-multiplex.h b/tools/multiplex/ompt-multiplex.h
new file mode 100644
index 000000000..e16189f93
--- /dev/null
+++ b/tools/multiplex/ompt-multiplex.h
@@ -0,0 +1,1094 @@
+//===--- ompt-multiplex.h - header-only multiplexing of OMPT tools -- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file enables an OMPT tool to load another OMPT tool and
+// automatically forwards OMPT event-callbacks to the nested tool.
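[Editorial sketch, not part of this patch] The "second" tool named by EXAMPLE_TOOL_LIBRARIES in the README above is just an ordinary OMPT tool. The following is a minimal, hypothetical client tool; the file name `second_tool.c`, the choice of the thread-begin callback, and the message text are illustrative assumptions. It relies only on the standard OMPT entry point `ompt_start_tool` and the `ompt_set_callback` runtime entry, both of which ompt-multiplex.h forwards to the client.

```
#include <omp-tools.h>
#include <stdio.h>

/* Example callback: report whenever a thread begins. */
static void on_thread_begin(ompt_thread_t thread_type,
                            ompt_data_t *thread_data) {
  printf("client tool: thread begin (type %d)\n", (int)thread_type);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_thread_begin, (ompt_callback_t)&on_thread_begin);
  return 1; /* non-zero keeps the tool active */
}

static void tool_finalize(ompt_data_t *tool_data) {}

/* Standard OMPT entry point; the multiplexing first tool locates this symbol
   in the shared object named by the EXAMPLE_TOOL_LIBRARIES variable. */
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {&tool_initialize, &tool_finalize,
                                            {0}};
  return &result;
}
```

Built, for instance, with `clang -fopenmp -shared -fPIC -o second_tool.so second_tool.c` and loaded via EXAMPLE_TOOL_LIBRARIES as shown in the README above.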
+// +// For details see openmp/tools/multiplex/README.md +// +//===----------------------------------------------------------------------===// + +#ifndef OMPT_MULTIPLEX_H +#define OMPT_MULTIPLEX_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +static ompt_set_callback_t ompt_multiplex_set_callback; +static ompt_get_task_info_t ompt_multiplex_get_task_info; +static ompt_get_thread_data_t ompt_multiplex_get_thread_data; +static ompt_get_parallel_info_t ompt_multiplex_get_parallel_info; + +// contains name of the environment var in which the tool path is specified +#ifndef CLIENT_TOOL_LIBRARIES_VAR +#error CLIENT_TOOL_LIBRARIES_VAR should be defined before including of ompt-multiplex.h +#endif + +#if defined(CUSTOM_DELETE_DATA) && !defined(CUSTOM_GET_CLIENT_DATA) +#error CUSTOM_GET_CLIENT_DATA must be set if CUSTOM_DELETE_DATA is set +#endif + +#define OMPT_API_ROUTINE static + +#define OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(macro) \ + macro(callback_thread_begin, ompt_callback_thread_begin_t, 1); \ + macro(callback_thread_end, ompt_callback_thread_end_t, 2); \ + macro(callback_parallel_begin, ompt_callback_parallel_begin_t, 3); \ + macro(callback_parallel_end, ompt_callback_parallel_end_t, 4); \ + macro(callback_task_create, ompt_callback_task_create_t, 5); \ + macro(callback_task_schedule, ompt_callback_task_schedule_t, 6); \ + macro(callback_implicit_task, ompt_callback_implicit_task_t, 7); \ + macro(callback_target, ompt_callback_target_t, 8); \ + macro(callback_target_data_op, ompt_callback_target_data_op_t, 9); \ + macro(callback_target_submit, ompt_callback_target_submit_t, 10); \ + macro(callback_control_tool, ompt_callback_control_tool_t, 11); \ + macro(callback_device_initialize, ompt_callback_device_initialize_t, 12); \ + macro(callback_device_finalize, ompt_callback_device_finalize_t, 13); \ + macro(callback_device_load, ompt_callback_device_load_t, 14); \ + macro(callback_device_unload, ompt_callback_device_unload_t, 15); \ + macro(callback_sync_region_wait, ompt_callback_sync_region_t, 16); \ + macro(callback_mutex_released, ompt_callback_mutex_t, 17); \ + macro(callback_dependences, ompt_callback_dependences_t, 18); \ + macro(callback_task_dependence, ompt_callback_task_dependence_t, 19); \ + macro(callback_work, ompt_callback_work_t, 20); \ + macro(callback_master, ompt_callback_master_t, 21); \ + macro(callback_target_map, ompt_callback_target_map_t, 22); \ + macro(callback_sync_region, ompt_callback_sync_region_t, 23); \ + macro(callback_lock_init, ompt_callback_mutex_acquire_t, 24); \ + macro(callback_lock_destroy, ompt_callback_mutex_t, 25); \ + macro(callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26); \ + macro(callback_mutex_acquired, ompt_callback_mutex_t, 27); \ + macro(callback_nest_lock, ompt_callback_nest_lock_t, 28); \ + macro(callback_flush, ompt_callback_flush_t, 29); \ + macro(callback_cancel, ompt_callback_cancel_t, 30); \ + macro(callback_reduction, ompt_callback_sync_region_t, 31); \ + macro(callback_dispatch, ompt_callback_dispatch_t, 32); + +typedef struct ompt_multiplex_callbacks_s { +#define ompt_event_macro(event, callback, eventid) callback ompt_##event + + OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro +} ompt_multiplex_callbacks_t; + +typedef struct ompt_multiplex_callback_implementation_status_s { +#define ompt_event_macro(event, callback, eventid) int ompt_##event + + OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(ompt_event_macro) + 
+#undef ompt_event_macro +} ompt_multiplex_callback_implementation_status_t; + +ompt_start_tool_result_t *ompt_multiplex_own_fns; +ompt_start_tool_result_t *ompt_multiplex_client_fns; +ompt_function_lookup_t ompt_multiplex_lookup_function; +ompt_multiplex_callbacks_t ompt_multiplex_own_callbacks, + ompt_multiplex_client_callbacks; +ompt_multiplex_callback_implementation_status_t + ompt_multiplex_implementation_status; + +typedef struct ompt_multiplex_data_pair_s { + ompt_data_t own_data; + ompt_data_t client_data; +} ompt_multiplex_data_pair_t; + +#if !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA) || \ + !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA) || \ + !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) +static ompt_multiplex_data_pair_t * +ompt_multiplex_allocate_data_pair(ompt_data_t *data_pointer) { + data_pointer->ptr = malloc(sizeof(ompt_multiplex_data_pair_t)); + if (!data_pointer->ptr) { + printf("Malloc ERROR\n"); + exit(-1); + } + ompt_multiplex_data_pair_t *data_pair = + (ompt_multiplex_data_pair_t *)data_pointer->ptr; + data_pair->own_data.ptr = NULL; + data_pair->client_data.ptr = NULL; + return data_pair; +} + +static void ompt_multiplex_free_data_pair(ompt_data_t *data_pointer) { + free((*data_pointer).ptr); +} + +static ompt_data_t *ompt_multiplex_get_own_ompt_data(ompt_data_t *data) { + if (!data) + return NULL; + ompt_multiplex_data_pair_t *data_pair = + (ompt_multiplex_data_pair_t *)data->ptr; + return &(data_pair->own_data); +} + +static ompt_data_t *ompt_multiplex_get_client_ompt_data(ompt_data_t *data) { + if (!data) + return NULL; + ompt_multiplex_data_pair_t *data_pair = + (ompt_multiplex_data_pair_t *)data->ptr; + return &(data_pair->client_data); +} +#endif //! defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA) || + //! !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA) || + //! 
!defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) + +static ompt_data_t *ompt_multiplex_get_own_thread_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + return ompt_multiplex_get_own_ompt_data(data); +#else + return data; +#endif +} + +static ompt_data_t *ompt_multiplex_get_own_parallel_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + return ompt_multiplex_get_own_ompt_data(data); +#else + return data; +#endif +} + +static ompt_data_t *ompt_multiplex_get_own_task_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + return ompt_multiplex_get_own_ompt_data(data); +#else + return data; +#endif +} + +static ompt_data_t *ompt_multiplex_get_client_thread_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + return ompt_multiplex_get_client_ompt_data(data); +#else + return OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA(data); +#endif +} + +static ompt_data_t *ompt_multiplex_get_client_parallel_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + return ompt_multiplex_get_client_ompt_data(data); +#else + return OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA(data); +#endif +} + +static ompt_data_t *ompt_multiplex_get_client_task_data(ompt_data_t *data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + return ompt_multiplex_get_client_ompt_data(data); +#else + return OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA(data); +#endif +} + +static void ompt_multiplex_callback_mutex_acquire(ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_mutex_acquire) { + ompt_multiplex_own_callbacks.ompt_callback_mutex_acquire( + kind, hint, impl, wait_id, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_mutex_acquire) { + ompt_multiplex_client_callbacks.ompt_callback_mutex_acquire( + kind, hint, impl, wait_id, codeptr_ra); + } +} + +static void ompt_multiplex_callback_mutex_acquired(ompt_mutex_t kind, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_mutex_acquired) { + ompt_multiplex_own_callbacks.ompt_callback_mutex_acquired(kind, wait_id, + codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_mutex_acquired) { + ompt_multiplex_client_callbacks.ompt_callback_mutex_acquired(kind, wait_id, + codeptr_ra); + } +} + +static void ompt_multiplex_callback_mutex_released(ompt_mutex_t kind, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_mutex_released) { + ompt_multiplex_own_callbacks.ompt_callback_mutex_released(kind, wait_id, + codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_mutex_released) { + ompt_multiplex_client_callbacks.ompt_callback_mutex_released(kind, wait_id, + codeptr_ra); + } +} + +static void ompt_multiplex_callback_nest_lock(ompt_scope_endpoint_t endpoint, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_nest_lock) { + ompt_multiplex_own_callbacks.ompt_callback_nest_lock(endpoint, wait_id, + codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_nest_lock) { + ompt_multiplex_client_callbacks.ompt_callback_nest_lock(endpoint, wait_id, + codeptr_ra); + } +} + +static void ompt_multiplex_callback_sync_region(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t 
*task_data, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_sync_region) { + ompt_multiplex_own_callbacks.ompt_callback_sync_region( + kind, endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_sync_region) { + ompt_multiplex_client_callbacks.ompt_callback_sync_region( + kind, endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), codeptr_ra); + } +} + +static void ompt_multiplex_callback_sync_region_wait( + ompt_sync_region_t kind, ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, ompt_data_t *task_data, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_sync_region_wait) { + ompt_multiplex_own_callbacks.ompt_callback_sync_region_wait( + kind, endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_sync_region_wait) { + ompt_multiplex_client_callbacks.ompt_callback_sync_region_wait( + kind, endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), codeptr_ra); + } +} + +static void ompt_multiplex_callback_flush(ompt_data_t *thread_data, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_flush) { + ompt_multiplex_own_callbacks.ompt_callback_flush( + ompt_multiplex_get_own_thread_data(thread_data), codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_flush) { + ompt_multiplex_client_callbacks.ompt_callback_flush( + ompt_multiplex_get_client_thread_data(thread_data), codeptr_ra); + } +} + +static void ompt_multiplex_callback_cancel(ompt_data_t *task_data, int flags, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_cancel) { + ompt_multiplex_own_callbacks.ompt_callback_cancel( + ompt_multiplex_get_own_task_data(task_data), flags, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_cancel) { + ompt_multiplex_client_callbacks.ompt_callback_cancel( + ompt_multiplex_get_client_task_data(task_data), flags, codeptr_ra); + } +} + +static void ompt_multiplex_callback_implicit_task( + ompt_scope_endpoint_t endpoint, ompt_data_t *parallel_data, + ompt_data_t *task_data, unsigned int team_size, unsigned int thread_num, + int flags) { + if (endpoint == ompt_scope_begin) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + ompt_multiplex_allocate_data_pair(task_data); +#endif +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + if (flags & ompt_task_initial) + ompt_multiplex_allocate_data_pair(parallel_data); +#endif + if (ompt_multiplex_own_callbacks.ompt_callback_implicit_task) { + ompt_multiplex_own_callbacks.ompt_callback_implicit_task( + endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), team_size, thread_num, + flags); + } + if (ompt_multiplex_client_callbacks.ompt_callback_implicit_task) { + ompt_multiplex_client_callbacks.ompt_callback_implicit_task( + endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), team_size, thread_num, + flags); + } + } else { +// defines to make sure, callbacks are called in correct order depending on +// defines set by the user +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) || \ + 
!defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_implicit_task) { + ompt_multiplex_own_callbacks.ompt_callback_implicit_task( + endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), team_size, thread_num, + flags); + } +#endif + + if (ompt_multiplex_client_callbacks.ompt_callback_implicit_task) { + ompt_multiplex_client_callbacks.ompt_callback_implicit_task( + endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), team_size, thread_num, + flags); + } + +#if defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) && \ + !defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_implicit_task) { + ompt_multiplex_own_callbacks.ompt_callback_implicit_task( + endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), team_size, thread_num, + flags); + } +#endif + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + ompt_multiplex_free_data_pair(task_data); +#endif + +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA) + if (flags & ompt_task_initial) + OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA(parallel_data); +#endif +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) + OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA(task_data); +#endif + } +} + +static void ompt_multiplex_callback_lock_init(ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_lock_init) { + ompt_multiplex_own_callbacks.ompt_callback_lock_init(kind, hint, impl, + wait_id, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_lock_init) { + ompt_multiplex_client_callbacks.ompt_callback_lock_init( + kind, hint, impl, wait_id, codeptr_ra); + } +} + +static void ompt_multiplex_callback_lock_destroy(ompt_mutex_t kind, + ompt_wait_id_t wait_id, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_lock_destroy) { + ompt_multiplex_own_callbacks.ompt_callback_lock_destroy(kind, wait_id, + codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_lock_destroy) { + ompt_multiplex_client_callbacks.ompt_callback_lock_destroy(kind, wait_id, + codeptr_ra); + } +} + +static void ompt_multiplex_callback_work(ompt_work_t wstype, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, uint64_t count, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_work) { + ompt_multiplex_own_callbacks.ompt_callback_work( + wstype, endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), count, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_work) { + ompt_multiplex_client_callbacks.ompt_callback_work( + wstype, endpoint, + ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), count, codeptr_ra); + } +} + +static void ompt_multiplex_callback_master(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_master) { + ompt_multiplex_own_callbacks.ompt_callback_master( + endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_master) { + 
ompt_multiplex_client_callbacks.ompt_callback_master( + endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), codeptr_ra); + } +} + +static void ompt_multiplex_callback_parallel_begin( + ompt_data_t *parent_task_data, const ompt_frame_t *parent_task_frame, + ompt_data_t *parallel_data, uint32_t requested_team_size, int flag, + const void *codeptr_ra) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + ompt_multiplex_allocate_data_pair(parallel_data); +#endif + if (ompt_multiplex_own_callbacks.ompt_callback_parallel_begin) { + ompt_multiplex_own_callbacks.ompt_callback_parallel_begin( + ompt_multiplex_get_own_task_data(parent_task_data), parent_task_frame, + ompt_multiplex_get_own_parallel_data(parallel_data), + requested_team_size, flag, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_parallel_begin) { + ompt_multiplex_client_callbacks.ompt_callback_parallel_begin( + ompt_multiplex_get_client_task_data(parent_task_data), + parent_task_frame, + ompt_multiplex_get_client_parallel_data(parallel_data), + requested_team_size, flag, codeptr_ra); + } +} + +static void ompt_multiplex_callback_parallel_end(ompt_data_t *parallel_data, + ompt_data_t *task_data, + int flag, + const void *codeptr_ra) { +// defines to make sure, callbacks are called in correct order depending on +// defines set by the user +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA) || \ + !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_parallel_end) { + ompt_multiplex_own_callbacks.ompt_callback_parallel_end( + ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), flag, codeptr_ra); + } +#endif + + if (ompt_multiplex_client_callbacks.ompt_callback_parallel_end) { + ompt_multiplex_client_callbacks.ompt_callback_parallel_end( + ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), flag, codeptr_ra); + } + +#if defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA) && \ + !defined(OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_parallel_end) { + ompt_multiplex_own_callbacks.ompt_callback_parallel_end( + ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), flag, codeptr_ra); + } +#endif + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + ompt_multiplex_free_data_pair(parallel_data); +#endif + +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA) + OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA(parallel_data); +#endif +} + +static void ompt_multiplex_callback_task_create( + ompt_data_t *parent_task_data, const ompt_frame_t *parent_frame, + ompt_data_t *new_task_data, int type, int has_dependences, + const void *codeptr_ra) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + ompt_multiplex_allocate_data_pair(new_task_data); +#endif + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + if (type & ompt_task_initial) { + ompt_data_t *parallel_data; + ompt_multiplex_get_parallel_info(0, ¶llel_data, NULL); + ompt_multiplex_allocate_data_pair(parallel_data); + } +#endif + + if (ompt_multiplex_own_callbacks.ompt_callback_task_create) { + ompt_multiplex_own_callbacks.ompt_callback_task_create( + ompt_multiplex_get_own_task_data(parent_task_data), parent_frame, + ompt_multiplex_get_own_task_data(new_task_data), type, has_dependences, + codeptr_ra); + } + if 
(ompt_multiplex_client_callbacks.ompt_callback_task_create) { + ompt_multiplex_client_callbacks.ompt_callback_task_create( + ompt_multiplex_get_client_task_data(parent_task_data), parent_frame, + ompt_multiplex_get_client_task_data(new_task_data), type, + has_dependences, codeptr_ra); + } +} + +static void +ompt_multiplex_callback_task_schedule(ompt_data_t *first_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *second_task_data) { + if (prior_task_status != ompt_task_complete) { + if (ompt_multiplex_own_callbacks.ompt_callback_task_schedule) { + ompt_multiplex_own_callbacks.ompt_callback_task_schedule( + ompt_multiplex_get_own_task_data(first_task_data), prior_task_status, + ompt_multiplex_get_own_task_data(second_task_data)); + } + if (ompt_multiplex_client_callbacks.ompt_callback_task_schedule) { + ompt_multiplex_client_callbacks.ompt_callback_task_schedule( + ompt_multiplex_get_client_task_data(first_task_data), + prior_task_status, + ompt_multiplex_get_client_task_data(second_task_data)); + } + } else { +// defines to make sure, callbacks are called in correct order depending on +// defines set by the user +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) || \ + !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_task_schedule) { + ompt_multiplex_own_callbacks.ompt_callback_task_schedule( + ompt_multiplex_get_own_task_data(first_task_data), prior_task_status, + ompt_multiplex_get_own_task_data(second_task_data)); + } +#endif + + if (ompt_multiplex_client_callbacks.ompt_callback_task_schedule) { + ompt_multiplex_client_callbacks.ompt_callback_task_schedule( + ompt_multiplex_get_client_task_data(first_task_data), + prior_task_status, + ompt_multiplex_get_client_task_data(second_task_data)); + } + +#if defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA) && \ + !defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_task_schedule) { + ompt_multiplex_own_callbacks.ompt_callback_task_schedule( + ompt_multiplex_get_own_task_data(first_task_data), prior_task_status, + ompt_multiplex_get_own_task_data(second_task_data)); + } +#endif + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + ompt_multiplex_free_data_pair(first_task_data); +#endif + +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA) + OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA(first_task_data); +#endif + } +} + +static void ompt_multiplex_callback_dependences(ompt_data_t *task_data, + const ompt_dependence_t *deps, + int ndeps) { + if (ompt_multiplex_own_callbacks.ompt_callback_dependences) { + ompt_multiplex_own_callbacks.ompt_callback_dependences( + ompt_multiplex_get_own_task_data(task_data), deps, ndeps); + } + if (ompt_multiplex_client_callbacks.ompt_callback_dependences) { + ompt_multiplex_client_callbacks.ompt_callback_dependences( + ompt_multiplex_get_client_task_data(task_data), deps, ndeps); + } +} + +static void +ompt_multiplex_callback_task_dependence(ompt_data_t *first_task_data, + ompt_data_t *second_task_data) { + if (ompt_multiplex_own_callbacks.ompt_callback_task_dependence) { + ompt_multiplex_own_callbacks.ompt_callback_task_dependence( + ompt_multiplex_get_own_task_data(first_task_data), + ompt_multiplex_get_own_task_data(second_task_data)); + } + if (ompt_multiplex_client_callbacks.ompt_callback_task_dependence) { + ompt_multiplex_client_callbacks.ompt_callback_task_dependence( + ompt_multiplex_get_client_task_data(first_task_data), + ompt_multiplex_get_client_task_data(second_task_data)); + } +} + 
+static void ompt_multiplex_callback_thread_begin(ompt_thread_t thread_type, + ompt_data_t *thread_data) { +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + ompt_multiplex_allocate_data_pair(thread_data); +#endif + if (ompt_multiplex_own_callbacks.ompt_callback_thread_begin) { + ompt_multiplex_own_callbacks.ompt_callback_thread_begin( + thread_type, ompt_multiplex_get_own_thread_data(thread_data)); + } + if (ompt_multiplex_client_callbacks.ompt_callback_thread_begin) { + ompt_multiplex_client_callbacks.ompt_callback_thread_begin( + thread_type, ompt_multiplex_get_client_thread_data(thread_data)); + } +} + +static void ompt_multiplex_callback_thread_end(ompt_data_t *thread_data) { +// defines to make sure, callbacks are called in correct order depending on +// defines set by the user +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA) || \ + !defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_thread_end) { + ompt_multiplex_own_callbacks.ompt_callback_thread_end( + ompt_multiplex_get_own_thread_data(thread_data)); + } +#endif + + if (ompt_multiplex_client_callbacks.ompt_callback_thread_end) { + ompt_multiplex_client_callbacks.ompt_callback_thread_end( + ompt_multiplex_get_client_thread_data(thread_data)); + } + +#if defined(OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA) && \ + !defined(OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA) + if (ompt_multiplex_own_callbacks.ompt_callback_thread_end) { + ompt_multiplex_own_callbacks.ompt_callback_thread_end( + ompt_multiplex_get_own_thread_data(thread_data)); + } +#endif + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + ompt_multiplex_free_data_pair(thread_data); +#endif + +#if defined(OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA) + OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA(thread_data); +#endif +} + +static int ompt_multiplex_callback_control_tool(uint64_t command, + uint64_t modifier, void *arg, + const void *codeptr_ra) { + int ownRet = 0, clientRet = 0; + if (ompt_multiplex_own_callbacks.ompt_callback_control_tool) { + ownRet = ompt_multiplex_own_callbacks.ompt_callback_control_tool( + command, modifier, arg, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_control_tool) { + clientRet = ompt_multiplex_client_callbacks.ompt_callback_control_tool( + command, modifier, arg, codeptr_ra); + } + return ownRet < clientRet ? 
ownRet : clientRet; +} + +static void ompt_multiplex_callback_target( + ompt_target_t kind, ompt_scope_endpoint_t endpoint, int device_num, + ompt_data_t *task_data, ompt_id_t target_id, const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_target) { + ompt_multiplex_own_callbacks.ompt_callback_target( + kind, endpoint, device_num, ompt_multiplex_get_own_task_data(task_data), + target_id, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_target) { + ompt_multiplex_client_callbacks.ompt_callback_target( + kind, endpoint, device_num, + ompt_multiplex_get_client_task_data(task_data), target_id, codeptr_ra); + } +} + +static void ompt_multiplex_callback_target_data_op( + ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype, + void *src_addr, int src_device_num, void *dest_addr, int dest_device_num, + size_t bytes, const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_target_data_op) { + ompt_multiplex_own_callbacks.ompt_callback_target_data_op( + target_id, host_op_id, optype, src_addr, src_device_num, dest_addr, + dest_device_num, bytes, codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_target_data_op) { + ompt_multiplex_client_callbacks.ompt_callback_target_data_op( + target_id, host_op_id, optype, src_addr, src_device_num, dest_addr, + dest_device_num, bytes, codeptr_ra); + } +} + +static void +ompt_multiplex_callback_target_submit(ompt_id_t target_id, ompt_id_t host_op_id, + unsigned int requested_num_teams) { + if (ompt_multiplex_own_callbacks.ompt_callback_target_submit) { + ompt_multiplex_own_callbacks.ompt_callback_target_submit( + target_id, host_op_id, requested_num_teams); + } + if (ompt_multiplex_client_callbacks.ompt_callback_target_submit) { + ompt_multiplex_client_callbacks.ompt_callback_target_submit( + target_id, host_op_id, requested_num_teams); + } +} + +static void ompt_multiplex_callback_device_initialize( + int device_num, const char *type, ompt_device_t *device, + ompt_function_lookup_t lookup, const char *documentation) { + if (ompt_multiplex_own_callbacks.ompt_callback_device_initialize) { + ompt_multiplex_own_callbacks.ompt_callback_device_initialize( + device_num, type, device, lookup, documentation); + } + if (ompt_multiplex_client_callbacks.ompt_callback_device_initialize) { + ompt_multiplex_client_callbacks.ompt_callback_device_initialize( + device_num, type, device, lookup, documentation); + } +} + +static void ompt_multiplex_callback_device_finalize(int device_num) { + if (ompt_multiplex_own_callbacks.ompt_callback_device_finalize) { + ompt_multiplex_own_callbacks.ompt_callback_device_finalize(device_num); + } + if (ompt_multiplex_client_callbacks.ompt_callback_device_finalize) { + ompt_multiplex_client_callbacks.ompt_callback_device_finalize(device_num); + } +} + +static void +ompt_multiplex_callback_device_load(int device_num, const char *filename, + int64_t offset_in_file, void *vma_in_file, + size_t bytes, void *host_addr, + void *device_addr, uint64_t module_id) { + if (ompt_multiplex_own_callbacks.ompt_callback_device_load) { + ompt_multiplex_own_callbacks.ompt_callback_device_load( + device_num, filename, offset_in_file, vma_in_file, bytes, host_addr, + device_addr, module_id); + } + if (ompt_multiplex_client_callbacks.ompt_callback_device_load) { + ompt_multiplex_client_callbacks.ompt_callback_device_load( + device_num, filename, offset_in_file, vma_in_file, bytes, host_addr, + device_addr, module_id); + } +} + +static void 
ompt_multiplex_callback_device_unload(int device_num, + uint64_t module_id) { + if (ompt_multiplex_own_callbacks.ompt_callback_device_unload) { + ompt_multiplex_own_callbacks.ompt_callback_device_unload(device_num, + module_id); + } + if (ompt_multiplex_client_callbacks.ompt_callback_device_unload) { + ompt_multiplex_client_callbacks.ompt_callback_device_unload(device_num, + module_id); + } +} + +static void +ompt_multiplex_callback_target_map(ompt_id_t target_id, unsigned int nitems, + void **host_addr, void **device_addr, + size_t *bytes, unsigned int *mapping_flags, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_target_map) { + ompt_multiplex_own_callbacks.ompt_callback_target_map( + target_id, nitems, host_addr, device_addr, bytes, mapping_flags, + codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_target_map) { + ompt_multiplex_client_callbacks.ompt_callback_target_map( + target_id, nitems, host_addr, device_addr, bytes, mapping_flags, + codeptr_ra); + } +} + +static void ompt_multiplex_callback_reduction(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + if (ompt_multiplex_own_callbacks.ompt_callback_reduction) { + ompt_multiplex_own_callbacks.ompt_callback_reduction( + kind, endpoint, ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), codeptr_ra); + } + if (ompt_multiplex_client_callbacks.ompt_callback_reduction) { + ompt_multiplex_client_callbacks.ompt_callback_reduction( + kind, endpoint, ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), codeptr_ra); + } +} + +static void ompt_multiplex_callback_dispatch(ompt_data_t *parallel_data, + ompt_data_t *task_data, + ompt_dispatch_t kind, + ompt_data_t instance) { + if (ompt_multiplex_own_callbacks.ompt_callback_dispatch) { + ompt_multiplex_own_callbacks.ompt_callback_dispatch( + ompt_multiplex_get_own_parallel_data(parallel_data), + ompt_multiplex_get_own_task_data(task_data), kind, instance); + } + if (ompt_multiplex_client_callbacks.ompt_callback_dispatch) { + ompt_multiplex_client_callbacks.ompt_callback_dispatch( + ompt_multiplex_get_client_parallel_data(parallel_data), + ompt_multiplex_get_client_task_data(task_data), kind, instance); + } +} + +// runtime entry functions + +int ompt_multiplex_own_get_task_info(int ancestor_level, int *type, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num) { + int ret = ompt_multiplex_get_task_info(ancestor_level, type, task_data, + task_frame, parallel_data, thread_num); + +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + if (task_data) + *task_data = ompt_multiplex_get_own_ompt_data(*task_data); +#endif +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + if (parallel_data) + *parallel_data = ompt_multiplex_get_own_ompt_data(*parallel_data); +#endif + return ret; +} + +int ompt_multiplex_client_get_task_info(int ancestor_level, int *type, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num) { + int ret = ompt_multiplex_get_task_info(ancestor_level, type, task_data, + task_frame, parallel_data, thread_num); + + if (task_data) +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA + *task_data = ompt_multiplex_get_client_ompt_data(*task_data); +#else + *task_data = OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA(*task_data); +#endif + + if 
(parallel_data) +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + *parallel_data = ompt_multiplex_get_client_ompt_data(*parallel_data); +#else + *parallel_data = + OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA(*parallel_data); +#endif + return ret; +} + +ompt_data_t *ompt_multiplex_own_get_thread_data() { + ompt_data_t *ret; +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + ret = ompt_multiplex_get_own_ompt_data(ompt_multiplex_get_thread_data()); +#else + ret = ompt_multiplex_get_thread_data(); +#endif + return ret; +} + +ompt_data_t *ompt_multiplex_client_get_thread_data() { + ompt_data_t *ret; +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA + ret = ompt_multiplex_get_client_ompt_data(ompt_multiplex_get_thread_data()); +#else + ret = OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA( + ompt_multiplex_get_thread_data()); +#endif + return ret; +} + +int ompt_multiplex_own_get_parallel_info(int ancestor_level, + ompt_data_t **parallel_data, + int *team_size) { + int ret = ompt_multiplex_get_parallel_info(ancestor_level, parallel_data, + team_size); + if (parallel_data) + *parallel_data = ompt_multiplex_get_own_parallel_data(*parallel_data); + return ret; +} + +int ompt_multiplex_client_get_parallel_info(int ancestor_level, + ompt_data_t **parallel_data, + int *team_size) { + int ret = ompt_multiplex_get_parallel_info(ancestor_level, parallel_data, + team_size); + if (parallel_data) +#ifndef OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA + *parallel_data = ompt_multiplex_get_client_ompt_data(*parallel_data); +#else + *parallel_data = + OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA(*parallel_data); +#endif + return ret; +} + +OMPT_API_ROUTINE int ompt_multiplex_own_set_callback(ompt_callbacks_t which, + ompt_callback_t callback) { + switch (which) { + +#define ompt_event_macro(event_name, callback_type, event_id) \ + case ompt_##event_name: \ + ompt_multiplex_own_callbacks.ompt_##event_name = (callback_type)callback; \ + if (ompt_multiplex_implementation_status.ompt_##event_name == -1) \ + return ompt_multiplex_implementation_status.ompt_##event_name = \ + ompt_multiplex_set_callback( \ + ompt_##event_name, \ + (ompt_callback_t)&ompt_multiplex_##event_name); \ + else \ + return ompt_multiplex_implementation_status.ompt_##event_name + + OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro + + default: + return ompt_set_error; + } +} + +OMPT_API_ROUTINE int +ompt_multiplex_client_set_callback(ompt_callbacks_t which, + ompt_callback_t callback) { + switch (which) { + +#define ompt_event_macro(event_name, callback_type, event_id) \ + case ompt_##event_name: \ + ompt_multiplex_client_callbacks.ompt_##event_name = \ + (callback_type)callback; \ + if (ompt_multiplex_implementation_status.ompt_##event_name == -1) \ + return ompt_multiplex_implementation_status.ompt_##event_name = \ + ompt_multiplex_set_callback( \ + ompt_##event_name, \ + (ompt_callback_t)&ompt_multiplex_##event_name); \ + else \ + return ompt_multiplex_implementation_status.ompt_##event_name + + OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro + + default: + return ompt_set_error; + } +} + +ompt_interface_fn_t ompt_multiplex_own_lookup(const char *name) { + if (!strcmp(name, "ompt_set_callback")) + return (ompt_interface_fn_t)&ompt_multiplex_own_set_callback; + else if (!strcmp(name, "ompt_get_task_info")) + return (ompt_interface_fn_t)&ompt_multiplex_own_get_task_info; + else if (!strcmp(name, "ompt_get_thread_data")) + return 
(ompt_interface_fn_t)&ompt_multiplex_own_get_thread_data; + else if (!strcmp(name, "ompt_get_parallel_info")) + return (ompt_interface_fn_t)&ompt_multiplex_own_get_parallel_info; + else + return ompt_multiplex_lookup_function(name); +} + +ompt_interface_fn_t ompt_multiplex_client_lookup(const char *name) { + if (!strcmp(name, "ompt_set_callback")) + return (ompt_interface_fn_t)&ompt_multiplex_client_set_callback; + else if (!strcmp(name, "ompt_get_task_info")) + return (ompt_interface_fn_t)&ompt_multiplex_client_get_task_info; + else if (!strcmp(name, "ompt_get_thread_data")) + return (ompt_interface_fn_t)&ompt_multiplex_client_get_thread_data; + else if (!strcmp(name, "ompt_get_parallel_info")) + return (ompt_interface_fn_t)&ompt_multiplex_client_get_parallel_info; + else + return ompt_multiplex_lookup_function(name); +} + +int ompt_multiplex_initialize(ompt_function_lookup_t lookup, + int initial_device_num, ompt_data_t *data) { + ompt_multiplex_lookup_function = lookup; + ompt_multiplex_set_callback = + (ompt_set_callback_t)lookup("ompt_set_callback"); + ompt_multiplex_get_task_info = + (ompt_get_task_info_t)lookup("ompt_get_task_info"); + ompt_multiplex_get_thread_data = + (ompt_get_thread_data_t)lookup("ompt_get_thread_data"); + ompt_multiplex_get_parallel_info = + (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info"); + + // initialize ompt_multiplex_implementation_status +#define ompt_event_macro(event_name, callback_type, event_id) \ + ompt_multiplex_implementation_status.ompt_##event_name = -1 + + OMPT_LOAD_CLIENT_FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro + + int ownRet = ompt_multiplex_own_fns->initialize( + ompt_multiplex_own_lookup, initial_device_num, + &(ompt_multiplex_own_fns->tool_data)); + int clientRet = 0; + if (ompt_multiplex_client_fns) + clientRet = ompt_multiplex_client_fns->initialize( + ompt_multiplex_client_lookup, initial_device_num, + &(ompt_multiplex_client_fns->tool_data)); + + return ownRet > clientRet ? 
ownRet : clientRet; +} + +void ompt_multiplex_finalize(ompt_data_t *fns) { + if (ompt_multiplex_client_fns) + ompt_multiplex_client_fns->finalize( + &(ompt_multiplex_client_fns->tool_data)); + ompt_multiplex_own_fns->finalize(&(ompt_multiplex_own_fns->tool_data)); +} + +#ifdef __cplusplus +extern "C" { +#endif +ompt_start_tool_result_t * +ompt_multiplex_own_start_tool(unsigned int omp_version, + const char *runtime_version); + +ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, + const char *runtime_version) { + // try loading client tool + ompt_multiplex_client_fns = NULL; + ompt_start_tool_result_t *(*client_start_tool)(unsigned int, const char *) = + NULL; + + const char *tool_libs = getenv(CLIENT_TOOL_LIBRARIES_VAR); + if (tool_libs) { + // copy environment variable + char *tool_libs_buffer = strdup(tool_libs); + if (!tool_libs_buffer) { + printf("strdup Error (%i)\n", errno); + exit(-1); + } + + int progress = 0; + while (progress < strlen(tool_libs)) { + int tmp_progress = progress; + while (tmp_progress < strlen(tool_libs) && + tool_libs_buffer[tmp_progress] != ':') + tmp_progress++; + if (tmp_progress < strlen(tool_libs)) + tool_libs_buffer[tmp_progress] = 0; + void *h = dlopen(tool_libs_buffer + progress, RTLD_LAZY); + if (h) { + client_start_tool = + (ompt_start_tool_result_t * (*)(unsigned int, const char *)) + dlsym(h, "ompt_start_tool"); + if (client_start_tool && + (ompt_multiplex_client_fns = + (*client_start_tool)(omp_version, runtime_version))) { + break; + } + } else { + printf("Loading %s from %s failed with: %s\n", + tool_libs_buffer + progress, CLIENT_TOOL_LIBRARIES_VAR, + dlerror()); + } + progress = tmp_progress + 1; + } + free(tool_libs_buffer); + } + // load own tool + ompt_multiplex_own_fns = + ompt_multiplex_own_start_tool(omp_version, runtime_version); + + // return multiplexed versions + static ompt_start_tool_result_t ompt_start_tool_result = { + &ompt_multiplex_initialize, &ompt_multiplex_finalize, {0}}; + return &ompt_start_tool_result; +} +#ifdef __cplusplus +} +#endif + +// We rename the ompt_start_tool function of the OMPT tool and call the +// renamed function from the ompt_start_tool function defined above. +#define ompt_start_tool ompt_multiplex_own_start_tool + +#endif /* OMPT_MULTIPLEX_H */ diff --git a/tools/multiplex/tests/CMakeLists.txt b/tools/multiplex/tests/CMakeLists.txt new file mode 100644 index 000000000..b7c1dfdba --- /dev/null +++ b/tools/multiplex/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +# CMakeLists.txt file for unit testing OMPT multiplex header.
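The header that closes above is not built as a library; an OMPT tool consumes it by defining CLIENT_TOOL_LIBRARIES_VAR and then including it, exactly as the first-tool.h/second-tool.h headers in the tests below do. The following minimal sketch shows that pattern; the environment variable name MY_TOOL_CLIENT_LIBRARIES, the my_* function names, and the print statements are illustrative assumptions, not part of this patch:

    #include <stdio.h>
    #include "omp-tools.h"

    /* Must be defined before the include: names the environment variable that
       lists the client tools this tool is willing to chain-load. */
    #define CLIENT_TOOL_LIBRARIES_VAR "MY_TOOL_CLIENT_LIBRARIES"
    #include "ompt-multiplex.h"

    static int my_initialize(ompt_function_lookup_t lookup,
                             int initial_device_num, ompt_data_t *tool_data) {
      /* The lookup passed in is ompt_multiplex_own_lookup, so ompt_set_callback
         and the inquiry functions obtained through it are already the
         multiplexed wrappers. */
      printf("own tool initialized\n");
      return 1; /* non-zero keeps the tool active */
    }

    static void my_finalize(ompt_data_t *tool_data) {
      printf("own tool finalized\n");
    }

    /* ompt-multiplex.h redefines ompt_start_tool, so this definition is in fact
       compiled as ompt_multiplex_own_start_tool and is called by the real
       ompt_start_tool that the header provides. */
    ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                              const char *runtime_version) {
      static ompt_start_tool_result_t result = {&my_initialize, &my_finalize,
                                                {0}};
      return &result;
    }

At run time, the header's ompt_start_tool walks the colon-separated list in MY_TOOL_CLIENT_LIBRARIES, dlopens each entry until one returns a valid ompt_start_tool_result_t, and from then on forwards every registered event to both tools.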
+include(CheckFunctionExists) +include(CheckLibraryExists) + +macro(pythonize_bool var) + if (${var}) + set(${var} True) + else() + set(${var} False) + endif() +endmacro() + +set(OMPT_LOAD_CLIENT_TEST_CFLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler") + +get_target_property(OMPT_PRINT_CALLBACKS_DIR bolt-ompt-print-callback INTERFACE_INCLUDE_DIRECTORIES) +add_openmp_testsuite(check-bolt-ompt-multiplex "Running OMPT multiplex tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS bolt-omp) + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by OMPT_LOAD_CLIENT configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/tools/multiplex/tests/custom_data_storage/custom_data_storage.c b/tools/multiplex/tests/custom_data_storage/custom_data_storage.c new file mode 100644 index 000000000..96d9a7584 --- /dev/null +++ b/tools/multiplex/tests/custom_data_storage/custom_data_storage.c @@ -0,0 +1,317 @@ +// RUN: %libomp-tool -DFIRST_TOOL -o %t.first.tool.so %s && \ +// RUN: %libomp-tool -DSECOND_TOOL -o %t.second.tool.so %s && \ +// RUN: %libomp-compile && \ +// RUN: env OMP_TOOL_LIBRARIES=%t.first.tool.so \ +// RUN: CUSTOM_DATA_STORAGE_TOOL_LIBRARIES=%t.second.tool.so \ +// RUN: %libomp-run | %sort-threads | FileCheck %s + +// For GCC we don't get an event for master, +// see runtime/test/ompt/synchronization/master.c +// UNSUPPORTED: gcc + +#if defined(FIRST_TOOL) +#include "first-tool.h" +#elif defined(SECOND_TOOL) +#include "second-tool.h" +#else /* APP */ + +#include "../ompt-signal.h" +#include "omp.h" +#include + +int main() { + int x, s = 0; +#pragma omp parallel num_threads(2) shared(s) + { +#pragma omp master + { +#pragma omp task shared(s) + { + omp_control_tool(5, 1, NULL); + OMPT_SIGNAL(s); + } + } + if (omp_get_thread_num() == 1) + OMPT_WAIT(s, 1); + } + return 0; +} +// Check if libomp supports the callbacks for this test.
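This test's first tool (first-tool.h, further down in this patch) exercises the custom-storage hooks of ompt-multiplex.h: instead of letting the header malloc its own {own, client} pair for every thread, parallel region, and task, the tool keeps both halves in a single struct and tells the header how to reach the client half and how to free the pair. A condensed sketch of just the task-data wiring, using hypothetical my_* names (the real tool uses custom_data_pair_t and custom_get_client_ompt_data, and it remains responsible for allocating the pair in its own begin callbacks, as on_cds_ompt_callback_task_create below does):

    #include <stdlib.h>
    #include "omp-tools.h"

    /* Forward declarations so the macros below can name them. */
    static ompt_data_t *my_get_client_data(ompt_data_t *data);
    static void my_delete_pair(ompt_data_t *data);

    /* With these macros set before the include, ompt-multiplex.h fetches and
       frees the client tool's task data through the tool's own storage instead
       of allocating a separate pair per task. */
    #define CLIENT_TOOL_LIBRARIES_VAR "MY_TOOL_CLIENT_LIBRARIES"
    #define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA my_get_client_data
    #define OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA my_delete_pair
    #include "ompt-multiplex.h"

    /* One heap object per task, holding both tools' halves side by side. */
    typedef struct my_data_pair_s {
      ompt_data_t own_data;    /* used by this tool's callbacks */
      ompt_data_t client_data; /* handed to the chained client tool */
    } my_data_pair_t;

    static ompt_data_t *my_get_client_data(ompt_data_t *data) {
      return data ? &((my_data_pair_t *)data->ptr)->client_data : NULL;
    }

    static void my_delete_pair(ompt_data_t *data) { free(data->ptr); }

The companion print test later in this patch uses the header's default storage instead, so its first-tool.h needs no such macros.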
+// CHECK-NOT: {{^}}0: Could not register callback + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] +// CHECK: {{^}}0: NULL_POINTER=[[NULL]] +// CHECK: {{^}}0: ompt_event_runtime_shutdown +// CHECK: {{^}}0: ompt_event_runtime_shutdown + +// CHECK: {{^}}[[_1ST_MSTR_TID:[0-9]+]]: _first_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, +// CHECK-SAME: thread_id=[[_1ST_MSTR_TID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_initial_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_INIT_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID:[0-9]+]], actual_parallelism=1, + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[_FIRST_INITIAL_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=(nil), +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID:[0-9]+]], requested_team_size=2, +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, invoker + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=0 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_masked_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[_FIRST_EXPLICIT_TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_masked_end: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[_FIRST_EXPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_control_tool: +// CHECK-SAME: command=5, modifier=1, arg=(nil), +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 0: +// CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 1: +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 2: +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: +// CHECK-SAME: _first_tool: parallel level 0: parallel_id=[[_FIRST_PARALLEL_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: parallel level 1: +// 
CHECK-SAME: parallel_id={{[0-9]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: +// CHECK-SAME: _first_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[_FIRST_EXPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_complete=1 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_end: +// CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: team_size=2, thread_num=0 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID]], invoker +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_1ST_MSTR_TID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID:[0-9]+]]: second_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, +// CHECK-SAME: thread_id=[[_2ND_MSTR_TID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_initial_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_INIT_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID:[0-9]+]], actual_parallelism=1, +// CHECK-SAME: index=1, flags=1 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[SECOND_INITIAL_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=(nil), +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID:[0-9]+]], requested_team_size=2, +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, invoker + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=0 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_masked_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_EXPLICIT_TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_masked_end: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: 
ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[SECOND_EXPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_control_tool: +// CHECK-SAME: command=5, modifier=1, arg=(nil), +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 0: +// CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 1: +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 2: +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: +// CHECK-SAME: second_tool: parallel level 0: parallel_id=[[SECOND_PARALLEL_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: parallel level 1: +// CHECK-SAME: parallel_id={{[0-9]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: +// CHECK-SAME: second_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[SECOND_EXPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_complete=1 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_end: +// CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], team_size=2, +// CHECK-SAME: thread_num=0 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID]], invoker +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_2ND_MSTR_TID]] + +// CHECK: {{^}}[[_1ST_WRKR_TID:[0-9]+]]: _first_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, +// CHECK-SAME: thread_id=[[_1ST_WRKR_TID]] + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=1 + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: 
{{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], team_size=0, +// thread_num=1 + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_1ST_WRKR_TID]] + +// CHECK: {{^}}[[_2ND_WRKR_TID:[0-9]+]]: second_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, +// CHECK-SAME: thread_id=[[_2ND_WRKR_TID]] + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=1 + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], team_size=0, +// CHECK-SAME: thread_num=1 + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_2ND_WRKR_TID]] + +#endif /* APP */ diff --git a/tools/multiplex/tests/custom_data_storage/first-tool.h b/tools/multiplex/tests/custom_data_storage/first-tool.h new file mode 100644 index 000000000..a553ab8d7 --- /dev/null +++ b/tools/multiplex/tests/custom_data_storage/first-tool.h @@ -0,0 +1,293 @@ +#include "omp-tools.h" + +#define ompt_start_tool disable_ompt_start_tool +#define _TOOL_PREFIX " _first_tool:" +#include "callback.h" +#undef _TOOL_PREFIX +#undef ompt_start_tool + +#define CLIENT_TOOL_LIBRARIES_VAR "CUSTOM_DATA_STORAGE_TOOL_LIBRARIES" +static ompt_data_t *custom_get_client_ompt_data(ompt_data_t *); +static void free_data_pair(ompt_data_t *); +#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_THREAD_DATA custom_get_client_ompt_data +#define OMPT_MULTIPLEX_CUSTOM_DELETE_THREAD_DATA free_data_pair +#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_PARALLEL_DATA \ + custom_get_client_ompt_data +#define OMPT_MULTIPLEX_CUSTOM_DELETE_PARALLEL_DATA free_data_pair +#define OMPT_MULTIPLEX_CUSTOM_GET_CLIENT_TASK_DATA custom_get_client_ompt_data +#define OMPT_MULTIPLEX_CUSTOM_DELETE_TASK_DATA free_data_pair +#include "ompt-multiplex.h" + +typedef struct custom_data_pair_s { + ompt_data_t own_data; + ompt_data_t client_data; +} custom_data_pair_t; + +static ompt_data_t *custom_get_client_ompt_data(ompt_data_t *data) { + if (data) + return &(((custom_data_pair_t *)(data->ptr))->client_data); + else + return NULL; +} + +static ompt_data_t *get_own_ompt_data(ompt_data_t *data) { + if (data) + return &(((custom_data_pair_t 
*)(data->ptr))->own_data); + else + return NULL; +} + +static ompt_multiplex_data_pair_t * +allocate_data_pair(ompt_data_t *data_pointer) { + data_pointer->ptr = malloc(sizeof(ompt_multiplex_data_pair_t)); + if (!data_pointer->ptr) { + printf("Malloc ERROR\n"); + exit(-1); + } + ompt_multiplex_data_pair_t *data_pair = + (ompt_multiplex_data_pair_t *)data_pointer->ptr; + data_pair->own_data.ptr = NULL; + data_pair->client_data.ptr = NULL; + return data_pair; +} + +static void free_data_pair(ompt_data_t *data_pointer) { + free((*data_pointer).ptr); +} + +static void on_cds_ompt_callback_sync_region(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + parallel_data = get_own_ompt_data(parallel_data); + task_data = get_own_ompt_data(task_data); + on_ompt_callback_sync_region(kind, endpoint, parallel_data, task_data, + codeptr_ra); +} + +static void on_cds_ompt_callback_sync_region_wait( + ompt_sync_region_t kind, ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, ompt_data_t *task_data, + const void *codeptr_ra) { + parallel_data = get_own_ompt_data(parallel_data); + task_data = get_own_ompt_data(task_data); + on_ompt_callback_sync_region_wait(kind, endpoint, parallel_data, task_data, + codeptr_ra); +} + +static void on_cds_ompt_callback_flush(ompt_data_t *thread_data, + const void *codeptr_ra) { + thread_data = get_own_ompt_data(thread_data); + on_ompt_callback_flush(thread_data, codeptr_ra); +} + +static void on_cds_ompt_callback_cancel(ompt_data_t *task_data, int flags, + const void *codeptr_ra) { + task_data = get_own_ompt_data(task_data); + on_ompt_callback_cancel(task_data, flags, codeptr_ra); +} + +static void on_cds_ompt_callback_implicit_task(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + unsigned int team_size, + unsigned int thread_num, + int type) { + if (endpoint == ompt_scope_begin && (type & ompt_task_initial)) { + allocate_data_pair(parallel_data); + } + if (endpoint == ompt_scope_begin) { + allocate_data_pair(task_data); + } + parallel_data = get_own_ompt_data(parallel_data); + task_data = get_own_ompt_data(task_data); + on_ompt_callback_implicit_task(endpoint, parallel_data, task_data, team_size, + thread_num, type); +} + +static void on_cds_ompt_callback_work(ompt_work_t wstype, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, uint64_t count, + const void *codeptr_ra) { + parallel_data = get_own_ompt_data(parallel_data); + task_data = get_own_ompt_data(task_data); + on_ompt_callback_work(wstype, endpoint, parallel_data, task_data, count, + codeptr_ra); +} + +static void on_cds_ompt_callback_master(ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + parallel_data = get_own_ompt_data(parallel_data); + task_data = get_own_ompt_data(task_data); + on_ompt_callback_masked(endpoint, parallel_data, task_data, codeptr_ra); +} + +static void on_cds_ompt_callback_parallel_begin( + ompt_data_t *parent_task_data, const ompt_frame_t *parent_task_frame, + ompt_data_t *parallel_data, uint32_t requested_team_size, int invoker, + const void *codeptr_ra) { + parent_task_data = get_own_ompt_data(parent_task_data); + if (parallel_data->ptr) + printf("%s\n", "0: parallel_data initially not null"); + allocate_data_pair(parallel_data); + parallel_data = get_own_ompt_data(parallel_data); + on_ompt_callback_parallel_begin(parent_task_data,
parent_task_frame, + parallel_data, requested_team_size, invoker, + codeptr_ra); +} + +static void on_cds_ompt_callback_parallel_end(ompt_data_t *parallel_data, + ompt_data_t *task_data, + int invoker, + const void *codeptr_ra) { + task_data = get_own_ompt_data(task_data); + parallel_data = get_own_ompt_data(parallel_data); + on_ompt_callback_parallel_end(parallel_data, task_data, invoker, codeptr_ra); +} + +static void on_cds_ompt_callback_task_create(ompt_data_t *parent_task_data, + const ompt_frame_t *parent_frame, + ompt_data_t *new_task_data, + int type, int has_dependences, + const void *codeptr_ra) { + parent_task_data = get_own_ompt_data(parent_task_data); + if (new_task_data->ptr) + printf("%s\n", "0: new_task_data initially not null"); + allocate_data_pair(new_task_data); + new_task_data = get_own_ompt_data(new_task_data); + on_ompt_callback_task_create(parent_task_data, parent_frame, new_task_data, + type, has_dependences, codeptr_ra); +} + +static void +on_cds_ompt_callback_task_schedule(ompt_data_t *first_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *second_task_data) { + ompt_data_t *original_first_task_data = first_task_data; + first_task_data = get_own_ompt_data(first_task_data); + second_task_data = get_own_ompt_data(second_task_data); + on_ompt_callback_task_schedule(first_task_data, prior_task_status, + second_task_data); +} + +static void on_cds_ompt_callback_dependences(ompt_data_t *task_data, + const ompt_dependence_t *deps, + int ndeps) { + task_data = get_own_ompt_data(task_data); + on_ompt_callback_dependences(task_data, deps, ndeps); +} + +static void +on_cds_ompt_callback_task_dependence(ompt_data_t *first_task_data, + ompt_data_t *second_task_data) { + first_task_data = get_own_ompt_data(first_task_data); + second_task_data = get_own_ompt_data(second_task_data); + on_ompt_callback_task_dependence(first_task_data, second_task_data); +} + +static void on_cds_ompt_callback_thread_begin(ompt_thread_t thread_type, + ompt_data_t *thread_data) { + if (thread_data->ptr) + printf("%s\n", "0: thread_data initially not null"); + allocate_data_pair(thread_data); + thread_data = get_own_ompt_data(thread_data); + on_ompt_callback_thread_begin(thread_type, thread_data); +} + +static void on_cds_ompt_callback_thread_end(ompt_data_t *thread_data) { + thread_data = get_own_ompt_data(thread_data); + on_ompt_callback_thread_end(thread_data); +} + +static int on_cds_ompt_callback_control_tool(uint64_t command, + uint64_t modifier, void *arg, + const void *codeptr_ra) { + printf("%" PRIu64 ": _first_tool: ompt_event_control_tool: command=%" PRIu64 + ", modifier=%" PRIu64 ", arg=%p, codeptr_ra=%p \n", + ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra); + + // print task data + int task_level = 0; + ompt_data_t *task_data; + while (ompt_get_task_info(task_level, NULL, (ompt_data_t **)&task_data, NULL, + NULL, NULL)) { + task_data = get_own_ompt_data(task_data); + printf("%" PRIu64 ": _first_tool: task level %d: task_id=%" PRIu64 "\n", + ompt_get_thread_data()->value, task_level, task_data->value); + task_level++; + } + + // print parallel data + int parallel_level = 0; + ompt_data_t *parallel_data; + while (ompt_get_parallel_info(parallel_level, (ompt_data_t **)¶llel_data, + NULL)) { + parallel_data = get_own_ompt_data(parallel_data); + printf("%" PRIu64 ": _first_tool: parallel level %d: parallel_id=%" PRIu64 + "\n", + ompt_get_thread_data()->value, parallel_level, parallel_data->value); + parallel_level++; + } + return 0; // success +} + +static 
ompt_get_thread_data_t ompt_cds_get_thread_data; +ompt_data_t *ompt_get_own_thread_data() { + return get_own_ompt_data(ompt_cds_get_thread_data()); +} + +#define register_callback2_t(name, type) \ + do { \ + type f_##name = &on_cds_##name; \ + if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \ + printf("0: Could not register callback '" #name "'\n"); \ + } while (0) + +#define register_callback2(name) register_callback2_t(name, name##_t) + +int ompt_cds_initialize(ompt_function_lookup_t lookup, int initial_device_num, + ompt_data_t *tool_data) { + ompt_initialize(lookup, initial_device_num, tool_data); + ompt_cds_get_thread_data = ompt_get_thread_data; + ompt_get_thread_data = ompt_get_own_thread_data; + + register_callback(ompt_callback_mutex_acquire); + register_callback_t(ompt_callback_mutex_acquired, ompt_callback_mutex_t); + register_callback_t(ompt_callback_mutex_released, ompt_callback_mutex_t); + register_callback(ompt_callback_nest_lock); + register_callback2(ompt_callback_sync_region); + register_callback2_t(ompt_callback_sync_region_wait, + ompt_callback_sync_region_t); + register_callback2(ompt_callback_control_tool); + register_callback2(ompt_callback_flush); + register_callback2(ompt_callback_cancel); + register_callback2(ompt_callback_implicit_task); + register_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t); + register_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t); + register_callback2(ompt_callback_work); + register_callback2(ompt_callback_master); + register_callback2(ompt_callback_parallel_begin); + register_callback2(ompt_callback_parallel_end); + register_callback2(ompt_callback_task_create); + register_callback2(ompt_callback_task_schedule); + register_callback2(ompt_callback_dependences); + register_callback2(ompt_callback_task_dependence); + register_callback2(ompt_callback_thread_begin); + register_callback2(ompt_callback_thread_end); + return 1; // success +} + +void ompt_cds_finalize(ompt_data_t *tool_data) { + printf("0: ompt_event_runtime_shutdown\n"); +} + +ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, + const char *runtime_version) { + static ompt_start_tool_result_t ompt_start_tool_result = { + &ompt_cds_initialize, &ompt_cds_finalize, 0}; + return &ompt_start_tool_result; +} diff --git a/tools/multiplex/tests/custom_data_storage/second-tool.h b/tools/multiplex/tests/custom_data_storage/second-tool.h new file mode 100644 index 000000000..4c0f39e49 --- /dev/null +++ b/tools/multiplex/tests/custom_data_storage/second-tool.h @@ -0,0 +1,5 @@ +#define CLIENT_TOOL_LIBRARIES_VAR "PRINT_EMBEDDED_TOOL_LIBRARIES" +#include "ompt-multiplex.h" +#define _TOOL_PREFIX " second_tool:" +#include "callback.h" +#undef _TOOL_PREFIX diff --git a/tools/multiplex/tests/lit.cfg b/tools/multiplex/tests/lit.cfg new file mode 100644 index 000000000..52c1000aa --- /dev/null +++ b/tools/multiplex/tests/lit.cfg @@ -0,0 +1,93 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import re +import subprocess +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. 
+if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(path): + if config.operating_system == 'Windows': + name = 'PATH' + sep = ';' + elif config.operating_system == 'Darwin': + name = 'DYLD_LIBRARY_PATH' + sep = ':' + else: + name = 'LD_LIBRARY_PATH' + sep = ':' + if name in config.environment: + config.environment[name] = path + sep + config.environment[name] + else: + config.environment[name] = path + +# name: The name of this test suite. +config.name = 'OMPT multiplex' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.test_obj_root + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.test_source_root + "/.."\ + " -I " + config.omp_header_dir + \ + " -L " + config.omp_library_dir + \ + " -I " + config.ompt_print_callback_dir + \ + " -Wl,-rpath," + config.omp_library_dir + \ + " " + config.test_openmp_flags + \ + " " + config.test_extra_flags + +# Allow XFAIL to work +config.target_triple = [ ] +for feature in config.test_compiler_features: + config.available_features.add(feature) + +# Setup environment to find dynamic library at runtime +append_dynamic_library_path(config.omp_library_dir) +append_dynamic_library_path(config.test_obj_root+"/..") + +# Rpath modifications for Darwin +if config.operating_system == 'Darwin': + config.test_flags += " -Wl,-rpath," + config.omp_library_dir + +# Find the SDK on Darwin +if config.operating_system == 'Darwin': + cmd = subprocess.Popen(['xcrun', '--show-sdk-path'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = cmd.communicate() + out = out.strip() + res = cmd.wait() + if res == 0 and out: + config.test_flags += " -isysroot " + out + +if 'Linux' in config.operating_system: + config.available_features.add("linux") + +# substitutions +config.substitutions.append(("FileCheck", "tee %%t.out | %s" % config.test_filecheck)) +config.substitutions.append(("%sort-threads", "sort --numeric-sort --stable")) + +config.substitutions.append(("%libomp-compile-and-run", \ + "%libomp-compile && %libomp-run")) +config.substitutions.append(("%libomp-compile", \ + "%clang %cflags %s -o %t")) +config.substitutions.append(("%libomp-tool", \ + "%clang %cflags -shared -fPIC -g")) +config.substitutions.append(("%libomp-run", "%t")) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flag", config.test_openmp_flags)) +config.substitutions.append(("%cflags", config.test_flags)) + diff --git a/tools/multiplex/tests/lit.site.cfg.in b/tools/multiplex/tests/lit.site.cfg.in new file mode 100644 index 000000000..dbe7a3329 --- /dev/null +++ b/tools/multiplex/tests/lit.site.cfg.in @@ -0,0 +1,16 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ +config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@OPENMP_TEST_FLAGS@" +config.test_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.omp_library_dir = "@LIBOMP_LIBRARY_DIR@" +config.omp_header_dir = "@LIBOMP_INCLUDE_DIR@" +config.ompt_print_callback_dir = 
"@OMPT_PRINT_CALLBACKS_DIR@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/tools/multiplex/tests/ompt-signal.h b/tools/multiplex/tests/ompt-signal.h new file mode 100644 index 000000000..9933dbfc3 --- /dev/null +++ b/tools/multiplex/tests/ompt-signal.h @@ -0,0 +1,23 @@ +// These functions are used to provide a signal-wait mechanism to enforce +// expected scheduling for the test cases. Conditional variable (s) needs to be +// shared! Initialize to 0 +#include + +#define OMPT_SIGNAL(s) ompt_signal(&s) +// inline +void ompt_signal(int *s) { +#pragma omp atomic + (*s)++; +} + +#define OMPT_WAIT(s, v) ompt_wait(&s, v) +// wait for s >= v +// inline +void ompt_wait(int *s, int v) { + int wait = 0; + do { + usleep(10); +#pragma omp atomic read + wait = (*s); + } while (wait < v); +} diff --git a/tools/multiplex/tests/print/first-tool.h b/tools/multiplex/tests/print/first-tool.h new file mode 100644 index 000000000..acd957264 --- /dev/null +++ b/tools/multiplex/tests/print/first-tool.h @@ -0,0 +1,5 @@ +#define CLIENT_TOOL_LIBRARIES_VAR "PRINT_TOOL_LIBRARIES" +#include "ompt-multiplex.h" +#define _TOOL_PREFIX " _first_tool:" +#include "callback.h" +#undef _TOOL_PREFIX diff --git a/tools/multiplex/tests/print/print.c b/tools/multiplex/tests/print/print.c new file mode 100644 index 000000000..c492899c6 --- /dev/null +++ b/tools/multiplex/tests/print/print.c @@ -0,0 +1,308 @@ +// RUN: %libomp-tool -DFIRST_TOOL -o %t.first.tool.so %s && \ +// RUN: %libomp-tool -DSECOND_TOOL -o %t.second.tool.so %s && \ +// RUN: %libomp-compile && \ +// RUN: env OMP_TOOL_LIBRARIES=%t.first.tool.so \ +// RUN: PRINT_TOOL_LIBRARIES=%t.second.tool.so \ +// RUN: %libomp-run | %sort-threads | FileCheck %s + +// For GCC we don't get an event for master, +// see runtime/test/ompt/sycnchronization/master.c +// UNSUPPORTED: gcc + +#if defined(FIRST_TOOL) +#include "first-tool.h" +#elif defined(SECOND_TOOL) +#include "second-tool.h" +#else /* APP */ + +#include "../ompt-signal.h" +#include "omp.h" +#include + +int main() { + int x, s = 0; +#pragma omp parallel num_threads(2) shared(s) + { +#pragma omp master + { +#pragma omp task shared(s) + { + omp_control_tool(5, 1, NULL); + OMPT_SIGNAL(s); + } + } + if (omp_get_thread_num() == 1) + OMPT_WAIT(s, 1); + } + return 0; +} + +// Check if libomp supports the callbacks for this test. 
+// CHECK-NOT: {{^}}0: Could not register callback + +// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] +// CHECK: {{^}}0: NULL_POINTER=[[NULL]] +// CHECK: {{^}}0: ompt_event_runtime_shutdown +// CHECK: {{^}}0: ompt_event_runtime_shutdown + +// CHECK: {{^}}[[_1ST_MSTR_TID:[0-9]+]]: _first_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[_1ST_MSTR_TID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_initial_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_INIT_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID:[0-9]+]], +// CHECK-SAME: actual_parallelism=1, index=1, flags=1 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[_FIRST_INITIAL_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=(nil), +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: requested_team_size=2, codeptr_ra={{0x[0-f]+}}, invoker + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=0 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_masked_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[_FIRST_EXPLICIT_TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_masked_end: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[_FIRST_EXPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_control_tool: +// CHECK-SAME: command=5, modifier=1, arg=(nil), codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 0: +// CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 1: +// CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: task level 2: +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: parallel level 0: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: parallel level 1: +// 
CHECK-SAME: parallel_id={{[0-9]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[_FIRST_EXPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_complete=1 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_end: +// CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: team_size=2, thread_num=0 + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_INITIAL_TASK_ID]], invoker +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_1ST_MSTR_TID]] +// CHECK: {{^}}[[_2ND_MSTR_TID:[0-9]+]]: second_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[_2ND_MSTR_TID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_initial_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_INIT_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID:[0-9]+]], +// CHECK-SAME: actual_parallelism=1, index=1, flags=1 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_parallel_begin: +// CHECK-SAME: parent_task_id=[[SECOND_INITIAL_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit=(nil), +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID:[0-9]+]], +// CHECK-SAME: requested_team_size=2, codeptr_ra={{0x[0-f]+}}, invoker + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=0 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_masked_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_create: +// CHECK-SAME: parent_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}, +// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}, +// CHECK-SAME: new_task_id=[[SECOND_EXPLICIT_TASK_ID:[0-9]+]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, +// CHECK-SAME: has_dependences=no + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_masked_end: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: 
parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[SECOND_EXPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_switch=7 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_control_tool: +// CHECK-SAME: command=5, modifier=1, arg=(nil), codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 0: +// CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 1: +// CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: task level 2: +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: parallel level 0: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: parallel level 1: +// CHECK-SAME: parallel_id={{[0-9]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_schedule: +// CHECK-SAME: first_task_id=[[SECOND_EXPLICIT_TASK_ID]], +// CHECK-SAME: second_task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: prior_task_status=ompt_task_complete=1 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_end: +// CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], +// CHECK-SAME: team_size=2, thread_num=0 + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_parallel_end: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_INITIAL_TASK_ID]], invoker +// CHECK-SAME: codeptr_ra={{0x[0-f]+}} + +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_2ND_MSTR_TID]] + +// CHECK: {{^}}[[_1ST_WRKR_TID:[0-9]+]]: _first_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[_1ST_WRKR_TID]] + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, +// CHECK-SAME: thread_num=1 + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], +// CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, 
task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: team_size=0, thread_num=1 + +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_1ST_WRKR_TID]] + +// CHECK: {{^}}[[_2ND_WRKR_TID:[0-9]+]]: second_tool: ompt_event_thread_begin: +// CHECK-SAME: thread_type=ompt_thread_worker=2, +// CHECK-SAME: thread_id=[[_2ND_WRKR_TID]] + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_implicit_task_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID:[0-9]+]], +// CHECK-SAME: team_size=2, thread_num=1 + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], +// CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: codeptr_ra=(nil) + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_implicit_task_end: +// CHECK-SAME: parallel_id=0, task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], +// CHECK-SAME: team_size=0, thread_num=1 + +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_thread_end: +// CHECK-SAME: thread_id=[[_2ND_WRKR_TID]] + +#endif /* APP */ diff --git a/tools/multiplex/tests/print/second-tool.h b/tools/multiplex/tests/print/second-tool.h new file mode 100644 index 000000000..4c0f39e49 --- /dev/null +++ b/tools/multiplex/tests/print/second-tool.h @@ -0,0 +1,5 @@ +#define CLIENT_TOOL_LIBRARIES_VAR "PRINT_EMBEDDED_TOOL_LIBRARIES" +#include "ompt-multiplex.h" +#define _TOOL_PREFIX " second_tool:" +#include "callback.h" +#undef _TOOL_PREFIX diff --git a/www/index.html b/www/index.html index eb52e8a0f..2050e3913 100644 --- a/www/index.html +++ b/www/index.html @@ -12,7 +12,7 @@