diff --git a/build.rs b/build.rs
index d435fcdbf..0fe2b2530 100644
--- a/build.rs
+++ b/build.rs
@@ -146,7 +146,7 @@ fn main() {
         println!("cargo:rustc-cfg=portable_atomic_llvm15");
     }
     if !no_asm
-        && (target_arch == "powerpc64" || target_arch == "s390x")
+        && (target_arch == "powerpc64" || target_arch == "s390x" || target_arch == "nvptx64")
         && is_allowed_feature("asm_experimental_arch")
     {
         println!("cargo:rustc-cfg=portable_atomic_asm_experimental_arch");
diff --git a/src/imp/mod.rs b/src/imp/mod.rs
index 1c43ddf53..32bb56478 100644
--- a/src/imp/mod.rs
+++ b/src/imp/mod.rs
@@ -65,6 +65,11 @@ mod s390x;
 #[cfg(target_arch = "msp430")]
 pub(crate) mod msp430;
 
+#[cfg(portable_atomic_asm_experimental_arch)]
+#[cfg(feature = "float")]
+#[cfg(target_arch = "nvptx64")]
+pub(crate) mod nvptx;
+
 #[cfg_attr(portable_atomic_no_cfg_target_has_atomic, cfg(any(test, portable_atomic_no_atomic_cas)))]
 #[cfg_attr(
     not(portable_atomic_no_cfg_target_has_atomic),
@@ -136,8 +141,13 @@ mod interrupt;
 
 // Atomic float implementations
 #[cfg(feature = "float")]
+#[cfg(not(all(target_arch = "nvptx64", portable_atomic_asm_experimental_arch)))]
 pub(crate) mod float;
 
+#[cfg(feature = "float")]
+#[cfg(all(target_arch = "nvptx64", portable_atomic_asm_experimental_arch))]
+pub(crate) use nvptx as float;
+
 // -----------------------------------------------------------------------------
 // Atomic{Isize,Usize,Bool,Ptr}, Atomic{I,U}{8,16}
diff --git a/src/imp/nvptx.rs b/src/imp/nvptx.rs
new file mode 100644
index 000000000..c60e32706
--- /dev/null
+++ b/src/imp/nvptx.rs
@@ -0,0 +1,401 @@
+// Atomic float implementation on NVPTX.
+//
+// Refs:
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
+// - User Guide for NVPTX Back-end (LLVM documentation) https://llvm.org/docs/NVPTXUsage.html
+// - https://github.com/NVIDIA/libcudacxx/blob/1.9.0-rc1/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h
+
+// TODO: fallback on pre-sm_70
+
+use core::{arch::asm, sync::atomic::Ordering};
+
+// NVPTX's seqcst atomic op is a preceding seqcst fence + an acquire op.
+macro_rules! fence_sc {
+    () => {
+        "fence.sc.gl;"
+    };
+}
+
+macro_rules! atomic_rmw {
+    ($op:ident, $order:ident) => {
+        match $order {
+            Ordering::Relaxed => $op!("relaxed", ""),
+            Ordering::Acquire => $op!("acquire", ""),
+            Ordering::Release => $op!("release", ""),
+            Ordering::AcqRel => $op!("acqrel", ""),
+            Ordering::SeqCst => $op!("acquire", fence_sc!()),
+            _ => unreachable!("{:?}", $order),
+        }
+    };
+}
+
+macro_rules! atomic_float {
+    (
+        $atomic_type:ident, $float_type:ident, $atomic_int_type:ident, $int_type:ident,
+        $val_reg:ident, $align:expr
+    ) => {
+        #[repr(C, align($align))]
+        pub(crate) struct $atomic_type {
+            v: core::cell::UnsafeCell<$float_type>,
+        }
+
+        // Send is implicitly implemented.
+        // SAFETY: any data races are prevented by atomic operations.
+        unsafe impl Sync for $atomic_type {}
+
+        impl $atomic_type {
+            #[inline]
+            pub(crate) const fn new(v: $float_type) -> Self {
+                Self { v: core::cell::UnsafeCell::new(v) }
+            }
+
+            #[inline]
+            pub(crate) fn is_lock_free() -> bool {
+                true
+            }
+            #[inline]
+            pub(crate) const fn is_always_lock_free() -> bool {
+                true
+            }
+
+            #[inline]
+            pub(crate) fn get_mut(&mut self) -> &mut $float_type {
+                // SAFETY: the mutable reference guarantees unique ownership.
+                // (UnsafeCell::get_mut requires Rust 1.50)
+                unsafe { &mut *self.v.get() }
+            }
+
+            #[inline]
+            pub(crate) fn into_inner(self) -> $float_type {
+                self.v.into_inner()
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn load(&self, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_load(self.v.get(), order) }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn store(&self, val: $float_type, order: Ordering) {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_store(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn swap(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_swap(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                let order = crate::utils::upgrade_success_ordering(success, failure);
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                let res = unsafe {
+                    $float_type::atomic_compare_exchange(self.v.get(), current, new, order)
+                };
+                if res.to_bits() == current.to_bits() {
+                    Ok(res)
+                } else {
+                    Err(res)
+                }
+            }
+
+            #[inline]
+            #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)]
+            pub(crate) fn compare_exchange_weak(
+                &self,
+                current: $float_type,
+                new: $float_type,
+                success: Ordering,
+                failure: Ordering,
+            ) -> Result<$float_type, $float_type> {
+                self.compare_exchange(current, new, success, failure)
+            }
+
+            #[inline]
+            pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_add(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type {
+                // There is no atom.sub, so subtraction is implemented as an atomic add of the negated value.
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_add(self.v.get(), -val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_max(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type {
+                // SAFETY: any data races are prevented by atomic intrinsics and the raw
+                // pointer passed in is valid because we got it from a reference.
+                unsafe { $float_type::atomic_min(self.v.get(), val, order) }
+            }
+
+            #[inline]
+            pub(crate) fn fetch_abs(&self, order: Ordering) -> $float_type {
+                const ABS_MASK: $int_type = !0 / 2;
+                $float_type::from_bits(self.as_bits().fetch_and(ABS_MASK, order))
+            }
+
+            #[inline]
+            pub(crate) fn as_bits(&self) -> &crate::$atomic_int_type {
+                // SAFETY: $atomic_type and $atomic_int_type have the same layout,
+                // and there is no concurrent access to the value that does not go through this method.
+                unsafe { &*(self as *const $atomic_type as *const crate::$atomic_int_type) }
+            }
+        }
+
+        impl AtomicOperations for $float_type {
+            unsafe fn atomic_load(src: *mut Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_load`.
+                unsafe {
+                    macro_rules! atomic_load {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("ld.", $sem, ".gpu.", stringify!($float_type), " {out}, [{src}];"),
+                                src = in(reg64) src,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_load!("relaxed", ""),
+                        Ordering::Acquire => atomic_load!("acquire", ""),
+                        Ordering::SeqCst => atomic_load!("acquire", fence_sc!()),
+                        _ => unreachable!("{:?}", order),
+                    }
+                }
+                out
+            }
+            unsafe fn atomic_store(dst: *mut Self, val: Self, order: Ordering) {
+                // SAFETY: the caller must uphold the safety contract for `atomic_store`.
+                unsafe {
+                    macro_rules! atomic_store {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("st.", $sem, ".gpu.", stringify!($float_type), " [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                            )
+                        };
+                    }
+                    match order {
+                        Ordering::Relaxed => atomic_store!("relaxed", ""),
+                        Ordering::Release => atomic_store!("release", ""),
+                        Ordering::SeqCst => atomic_store!("relaxed", fence_sc!()),
+                        _ => unreachable!("{:?}", order),
+                    }
+                }
+            }
+            unsafe fn atomic_swap(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_swap`.
+                unsafe {
+                    macro_rules! swap {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.exch.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(swap, order);
+                }
+                out
+            }
+            unsafe fn atomic_compare_exchange(
+                dst: *mut Self,
+                old: Self,
+                new: Self,
+                order: Ordering,
+            ) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_compare_exchange`.
+                unsafe {
+                    macro_rules! cmpxchg {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.cas.", stringify!($float_type), " {out}, [{dst}], {old}, {new};"),
+                                dst = in(reg64) dst,
+                                old = in($val_reg) old,
+                                new = in($val_reg) new,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(cmpxchg, order);
+                }
+                out
+            }
+            unsafe fn atomic_and(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_and`.
+                unsafe {
+                    macro_rules! and {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.and.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(and, order);
+                }
+                out
+            }
+            unsafe fn atomic_or(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_or`.
+                unsafe {
+                    macro_rules! or {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.or.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(or, order);
+                }
+                out
+            }
+            unsafe fn atomic_xor(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_xor`.
+                unsafe {
+                    macro_rules! xor {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.xor.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(xor, order);
+                }
+                out
+            }
+            unsafe fn atomic_add(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_add`.
+                unsafe {
+                    macro_rules! add {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.add.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(add, order);
+                }
+                out
+            }
+            unsafe fn atomic_min(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_min`.
+                unsafe {
+                    macro_rules! min {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.min.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(min, order);
+                }
+                out
+            }
+            unsafe fn atomic_max(dst: *mut Self, val: Self, order: Ordering) -> Self {
+                let out;
+                // SAFETY: the caller must uphold the safety contract for `atomic_max`.
+                unsafe {
+                    macro_rules! max {
+                        ($sem:tt, $fence_sc:expr) => {
+                            asm!(
+                                $fence_sc,
+                                concat!("atom.", $sem, ".gpu.max.", stringify!($float_type), " {out}, [{dst}], {val};"),
+                                dst = in(reg64) dst,
+                                val = in($val_reg) val,
+                                out = out($val_reg) out,
+                            )
+                        };
+                    }
+                    atomic_rmw!(max, order);
+                }
+                out
+            }
+        }
+    };
+}
+
+trait AtomicOperations: Sized {
+    unsafe fn atomic_load(src: *mut Self, order: Ordering) -> Self;
+    unsafe fn atomic_store(dst: *mut Self, val: Self, order: Ordering);
+    unsafe fn atomic_swap(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_compare_exchange(
+        dst: *mut Self,
+        old: Self,
+        new: Self,
+        order: Ordering,
+    ) -> Self;
+    unsafe fn atomic_add(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_and(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_or(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_xor(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_min(dst: *mut Self, val: Self, order: Ordering) -> Self;
+    unsafe fn atomic_max(dst: *mut Self, val: Self, order: Ordering) -> Self;
+}
+
+atomic_float!(AtomicF32, f32, AtomicU32, u32, reg32, 4);
+atomic_float!(AtomicF64, f64, AtomicU64, u64, reg64, 8);
diff --git a/src/lib.rs b/src/lib.rs
index 176060434..084e39f73 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -183,7 +183,7 @@ See [this list](https://github.com/taiki-e/portable-atomic/issues/10#issuecommen
 )]
 // asm_experimental_arch
 // AVR and MSP430 are tier 3 platforms and require nightly anyway.
-// On tier 2 platforms (powerpc64 and s390x), we use cfg set by build script to
+// On tier 2 platforms (powerpc64, s390x, nvptx64), we use cfg set by build script to
 // determine whether this feature is available or not.
 #![cfg_attr(
     all(
@@ -201,6 +201,7 @@ See [this list](https://github.com/taiki-e/portable-atomic/issues/10#issuecommen
                 )
             ),
             all(portable_atomic_asm_experimental_arch, target_arch = "s390x"),
+            all(portable_atomic_asm_experimental_arch, target_arch = "nvptx64"),
         ),
     ),
     feature(asm_experimental_arch)
diff --git a/tools/build.sh b/tools/build.sh
index 7a9375cd4..8405df9ad 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -46,6 +46,8 @@ default_targets=(
     # riscv32 with atomic
     riscv32imac-unknown-none-elf
    riscv32imc-esp-espidf
+    # nvptx64
+    nvptx64-nvidia-cuda
     # tier 1 targets
     aarch64-unknown-linux-gnu
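
Note (not part of the patch above): a minimal device-side sketch of how this backend is reached through the crate's public float API. It assumes the `float` feature, a nightly compiler, and the `nvptx64-nvidia-cuda` target where the build script sets `portable_atomic_asm_experimental_arch`; the `TOTAL`/`accumulate`/`record_max` names are illustrative only.

```rust
#![no_std]
// Hypothetical usage sketch: on nvptx64 the public AtomicF32/AtomicF64 are backed by
// src/imp/nvptx.rs via the `pub(crate) use nvptx as float;` added in src/imp/mod.rs.
use core::sync::atomic::Ordering;
use portable_atomic::AtomicF32;

static TOTAL: AtomicF32 = AtomicF32::new(0.0);

pub fn accumulate(sample: f32) -> f32 {
    // A relaxed fetch_add lowers to a single `atom.relaxed.gpu.add.f32`;
    // fetch_sub reuses the same instruction with a negated operand (no atom.sub).
    TOTAL.fetch_add(sample, Ordering::Relaxed)
}

pub fn record_max(sample: f32) -> f32 {
    // SeqCst is implemented as `fence.sc.gl;` followed by the acquire form of the
    // operation, per the fence_sc!/atomic_rmw! macros in the new module.
    TOTAL.fetch_max(sample, Ordering::SeqCst)
}
```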