diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h index 78dc8d2da3c191..155918fba6e077 100644 --- a/libc/src/__support/CPP/atomic.h +++ b/libc/src/__support/CPP/atomic.h @@ -26,6 +26,18 @@ enum class MemoryOrder : int { SEQ_CST = __ATOMIC_SEQ_CST }; +// These are a clang extension, see the clang documentation for more information: +// https://clang.llvm.org/docs/LanguageExtensions.html#scoped-atomic-builtins. +enum class MemoryScope : int { +#if defined(__MEMORY_SCOPE_SYSTEM) && defined(__MEMORY_SCOPE_DEVICE) + SYSTEM = __MEMORY_SCOPE_SYSTEM, + DEVICE = __MEMORY_SCOPE_DEVICE, +#else + SYSTEM = 0, + DEVICE = 0, +#endif +}; + template struct Atomic { // For now, we will restrict to only arithmetic types. static_assert(is_arithmetic_v, "Only arithmetic types can be atomic."); @@ -54,48 +66,82 @@ template struct Atomic { Atomic(const Atomic &) = delete; Atomic &operator=(const Atomic &) = delete; - // Atomic load + // Atomic load. operator T() { return __atomic_load_n(&val, int(MemoryOrder::SEQ_CST)); } - T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_load_n(&val, int(mem_ord)); + T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_load_n)) + return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope)); + else + return __atomic_load_n(&val, int(mem_ord)); } - // Atomic store + // Atomic store. 
T operator=(T rhs) { __atomic_store_n(&val, rhs, int(MemoryOrder::SEQ_CST)); return rhs; } - void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - __atomic_store_n(&val, rhs, int(mem_ord)); + void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_store_n)) + __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope)); + else + __atomic_store_n(&val, rhs, int(mem_ord)); } // Atomic compare exchange - bool compare_exchange_strong(T &expected, T desired, - MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { + bool compare_exchange_strong( + T &expected, T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { return __atomic_compare_exchange_n(&val, &expected, desired, false, int(mem_ord), int(mem_ord)); } - T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_exchange_n(&val, desired, int(mem_ord)); + T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_exchange_n)) + return __scoped_atomic_exchange_n(&val, desired, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_exchange_n(&val, desired, int(mem_ord)); } - T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_add(&val, increment, int(mem_ord)); + T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_add)) + return __scoped_atomic_fetch_add(&val, increment, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_add(&val, increment, int(mem_ord)); } - T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_or(&val, mask, int(mem_ord)); 
+ T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_or)) + return __scoped_atomic_fetch_or(&val, mask, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_or(&val, mask, int(mem_ord)); } - T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_and(&val, mask, int(mem_ord)); + T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_and)) + return __scoped_atomic_fetch_and(&val, mask, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_and(&val, mask, int(mem_ord)); } - T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_sub(&val, decrement, int(mem_ord)); + T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::SYSTEM) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_sub)) + return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_sub(&val, decrement, int(mem_ord)); } // Set the value without using an atomic operation. This is useful diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index 08c1dfd10d6d7f..7b2c89ac4dce48 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -109,14 +109,16 @@ template struct Process { /// Retrieve the inbox state from memory shared between processes. 
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const { - return gpu::broadcast_value(lane_mask, - inbox[index].load(cpp::MemoryOrder::RELAXED)); + return gpu::broadcast_value( + lane_mask, + inbox[index].load(cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM)); } /// Retrieve the outbox state from memory shared between processes. LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const { return gpu::broadcast_value(lane_mask, - outbox[index].load(cpp::MemoryOrder::RELAXED)); + outbox[index].load(cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::SYSTEM)); } /// Signal to the other process that this one is finished with the buffer. @@ -126,7 +128,8 @@ template struct Process { LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) { uint32_t inverted_outbox = !current_outbox; atomic_thread_fence(cpp::MemoryOrder::RELEASE); - outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED); + outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::SYSTEM); return inverted_outbox; } @@ -241,7 +244,8 @@ template struct Process { uint32_t slot = index / NUM_BITS_IN_WORD; uint32_t bit = index % NUM_BITS_IN_WORD; return bits[slot].fetch_or(static_cast(cond) << bit, - cpp::MemoryOrder::RELAXED) & + cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::DEVICE) & (1u << bit); } @@ -251,7 +255,8 @@ template struct Process { uint32_t slot = index / NUM_BITS_IN_WORD; uint32_t bit = index % NUM_BITS_IN_WORD; return bits[slot].fetch_and(~0u ^ (static_cast(cond) << bit), - cpp::MemoryOrder::RELAXED) & + cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::DEVICE) & (1u << bit); } };